diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index f1435f0ccc..b6a4649a30 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -102,6 +102,8 @@ jobs:
         run: |
           echo "WHEELS_OUTPUT_FOLDER=$GITHUB_WORKSPACE${{ runner.os == 'Linux' && '/' || '\\' }}" >> $GITHUB_ENV
       - name: Build conda package
+        env:
+          OVERRIDE_INTEL_IPO: 1   # IPO requires more resources than the GH Actions VM provides
         run: conda build --no-test --python ${{ matrix.python }} -c intel -c conda-forge --override-channels conda-recipe
       - name: Upload artifact
         uses: actions/upload-artifact@v3
@@ -181,6 +183,7 @@ jobs:
           python -c "import dpctl; dpctl.lsplatform(verbosity=2)"
       - name: Install gdb
         run: |
+          sudo apt-get update --fix-missing
           sudo apt-get install -y gdb
       - name: Run test_elementwise under gdb
         run: |
@@ -320,15 +323,22 @@ jobs:
       matrix:
         python: ['3.9', '3.10', '3.11']
     steps:
-      - name: Download artifact
+      - name: Download conda artifact
         uses: actions/download-artifact@v3
         with:
          name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
+      - name: Download wheel artifact
+        uses: actions/download-artifact@v3
+        with:
+          name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Wheels Python ${{ matrix.python }}
+
       - name: Install anaconda-client
         run: conda install anaconda-client
       - name: Add conda to system path
         run: echo $CONDA/bin >> $GITHUB_PATH
+      - name: Package version
+        run: echo "PACKAGE_VERSION=$(basename ${{ env.PACKAGE_NAME }}-*.tar.bz2 | sed 's/^${{ env.PACKAGE_NAME }}-\([^-]*\).*/\1/')" >> $GITHUB_ENV

       - name: Upload
         env:
@@ -339,7 +349,7 @@ jobs:
       - name: Upload Wheels
         env:
           ANACONDA_TOKEN: ${{ secrets.ANACONDA_TOKEN }}
-        run: anaconda --token ${{ env.ANACONDA_TOKEN }} upload --user dppy --label dev ${{ env.PACKAGE_NAME }}-*.whl
+        run: anaconda --token ${{ env.ANACONDA_TOKEN }} upload --user dppy --label dev ${{ env.PACKAGE_NAME }}-*.whl --version ${{ env.PACKAGE_VERSION }}

   upload_windows:
     needs: test_windows
@@ -353,13 +363,24 @@ jobs:
         uses: actions/download-artifact@v3
         with:
           name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
+
+      - name: Download wheel artifact
+        uses: actions/download-artifact@v3
+        with:
+          name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Wheels Python ${{ matrix.python }}
+
       - uses: conda-incubator/setup-miniconda@v2
         with:
           auto-activate-base: true
           activate-environment: ""
+
       - name: Install anaconda-client
         run: conda install anaconda-client
+      - name: Package version
+        shell: bash -el {0}
+        run: echo "PACKAGE_VERSION=$(basename ${{ env.PACKAGE_NAME }}-*.tar.bz2 | sed 's/^${{ env.PACKAGE_NAME }}-\([^-]*\).*/\1/')" >> $GITHUB_ENV
+
       - name: Upload
         env:
           ANACONDA_TOKEN: ${{ secrets.ANACONDA_TOKEN }}
@@ -369,7 +390,37 @@ jobs:
       - name: Upload Wheels
         env:
           ANACONDA_TOKEN: ${{ secrets.ANACONDA_TOKEN }}
-        run: anaconda --token ${{ env.ANACONDA_TOKEN }} upload --user dppy --label dev ${{ env.PACKAGE_NAME }}-*.whl
+        run: anaconda --token ${{ env.ANACONDA_TOKEN }} upload --user dppy --label dev ${{ env.PACKAGE_NAME }}-*.whl --version ${{ env.PACKAGE_VERSION }}
+
+  cleanup_packages:
+    name: Clean up anaconda packages
+    needs: [upload_linux, upload_windows]
+    runs-on: 'ubuntu-latest'
+    defaults:
+      run:
+        shell: bash -el {0}
+    steps:
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          run-post: false
+          channel-priority: "disabled"
+          channels: conda-forge
+          python-version: '3.11'
+
+      - name: Install anaconda-client
+        run: conda install anaconda-client
+
+      - name: Checkout repo
+        uses: actions/checkout@v2
+        with:
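+          # checks out IntelPython/devops-tools (a separate repository) so the
+          # cleanup step below can run its scripts/cleanup-old-packages.py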
+          repository: IntelPython/devops-tools
+          fetch-depth: 0
+
+      - name: Cleanup old packages
+        run: |
+          python scripts/cleanup-old-packages.py \
+          --verbose --force --token ${{ secrets.ANACONDA_TOKEN }} \
+          --package dppy/${{ env.PACKAGE_NAME }} --label dev

   test_examples_linux:
     needs: build_linux
@@ -615,7 +666,7 @@ jobs:
           python -c "import dpctl; dpctl.lsplatform()"
           export ARRAY_API_TESTS_MODULE=dpctl.tensor
           cd /home/runner/work/array-api-tests
-          pytest --ci --json-report --json-report-file=$FILE array_api_tests/ || true
+          pytest --json-report --json-report-file=$FILE array_api_tests/ || true
       - name: Set Github environment variables
         shell: bash -l {0}
         run: |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b91813feda..eb53db12ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,7 @@
 cmake_minimum_required(VERSION 3.21...3.27 FATAL_ERROR)

 project(dpctl
+    VERSION 0.15
     LANGUAGES CXX
     DESCRIPTION "Python interface for XPU programming"
 )
@@ -17,7 +18,7 @@ option(DPCTL_GENERATE_COVERAGE
     OFF
 )

-find_package(IntelDPCPP REQUIRED PATHS ${CMAKE_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
+find_package(IntelSYCL REQUIRED PATHS ${CMAKE_SOURCE_DIR}/cmake NO_DEFAULT_PATH)

 add_subdirectory(libsyclinterface)

diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index d72914b30b..0000000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,23 +0,0 @@
-recursive-include dpctl/include *.h
-recursive-include dpctl/include *.hpp
-include dpctl/include/dpctl4pybind11.hpp
-recursive-include dpctl *.pxd
-recursive-include dpctl *.cmake
-include dpctl/_sycl_context.h
-include dpctl/_sycl_context_api.h
-include dpctl/_sycl_device.h
-include dpctl/_sycl_device_api.h
-include dpctl/_sycl_queue.h
-include dpctl/_sycl_queue_api.h
-include dpctl/_sycl_event.h
-include dpctl/_sycl_event_api.h
-include dpctl/memory/_memory.h
-include dpctl/memory/_memory_api.h
-include dpctl/program/_program.h
-include dpctl/program/_program_api.h
-include dpctl/tensor/_usmarray.h
-include dpctl/tensor/_usmarray_api.h
-recursive-include dpctl/tensor/include *
-recursive-include dpctl/tensor/libtensor/include *
-include dpctl/tests/input_files/*
-include dpctl/tests/*.pyx
diff --git a/cmake/IntelDPCPPConfig.cmake b/cmake/IntelSYCLConfig.cmake
old mode 100644
new mode 100755
similarity index 64%
rename from cmake/IntelDPCPPConfig.cmake
rename to cmake/IntelSYCLConfig.cmake
index 37d79a3ec1..c51e47290c
--- a/cmake/IntelDPCPPConfig.cmake
+++ b/cmake/IntelSYCLConfig.cmake
@@ -1,5 +1,5 @@
 #
-# Modifications, Copyright (C) 2021 Intel Corporation
+# Modifications, Copyright (C) 2022 Intel Corporation
 #
 # This software and the related documents are Intel copyrighted materials, and
 # your use of them is governed by the express license under which they were
@@ -15,10 +15,10 @@
 # file Copyright.txt or https://cmake.org/licensing for details.

 #[=======================================================================[.rst:
-IntelDPCPPConfig
+IntelSYCLConfig
 -------

-DPCPP Library to verify DPCPP/SYCL compatability of CMAKE_CXX_COMPILER
+Library to verify SYCL compatibility of CMAKE_CXX_COMPILER
 and passes relevant compiler flags.

 Result Variables
 ^^^^^^^^^^^^^^^^

 This will define the following variables:

-``IntelDPCPP_FOUND``
-  True if the system has the DPCPP library.
+``IntelSYCL_FOUND``
+  True if the system has the SYCL library.
 ``SYCL_LANGUAGE_VERSION``
   The SYCL language spec version by Compiler.
 ``SYCL_INCLUDE_DIR``
@@ -37,35 +37,39 @@ This will define the following variables:
 ``SYCL_FLAGS``
   SYCL specific flags for the compiler.
+``IntelSYCL::SYCL_CXX``
+  Target for using Intel SYCL (DPC++). The following properties are defined
+  for the target: ``INTERFACE_COMPILE_OPTIONS``, ``INTERFACE_LINK_OPTIONS``,
+  ``INTERFACE_INCLUDE_DIRECTORIES``, and ``INTERFACE_LINK_DIRECTORIES``
+
 Cache Variables
 ^^^^^^^^^^^^^^^

-The following cache variables may also be set:
+The following cache variable may also be set:

-``SYCL_INCLUDE_DIR``
-  The directory containing ``sycl.hpp``.
-``SYCL_LIBRARY_DIR``
-  The path to the SYCL library.
-``SYCL_FLAGS``
-  SYCL specific flags for the compiler.
 ``SYCL_LANGUAGE_VERSION``
   The SYCL language spec version by Compiler.

-.. note::
+.. Note::

-  For now, user needs to set -DCMAKE_CXX_COMPILER or environment of
+  1. User needs to set -DCMAKE_CXX_COMPILER or environment of
   CXX pointing to SYCL compatible compiler  ( eg: icx, clang++, icpx)

-  Note: do not set to DPCPP compiler. If set to a Compiler family
-  that supports dpcpp ( eg: IntelLLVM) both DPCPP and SYCL
-  features are enabled.
-  And add this package to user's Cmake config file.
+
+  2. Add this package to user's CMake config file.
+
+  .. code-block:: cmake
+
+     find_package(IntelSYCL REQUIRED)
+
+  3. Add sources to target through add_sycl_to_target()

   .. code-block:: cmake

-     find_package(IntelDPCPP REQUIRED)
+     # Compile specific sources for SYCL and build target for SYCL
+     add_executable(target_proj A.cpp B.cpp offload1.cpp offload2.cpp)
+     add_sycl_to_target(TARGET target_proj SOURCES offload1.cpp offload2.cpp)

 #]=======================================================================]

@@ -83,25 +87,33 @@ endif()
 string(COMPARE EQUAL "${CMAKE_CXX_COMPILER}" "" nocmplr)
 if(nocmplr)
-  set(IntelDPCPP_FOUND False)
+  set(IntelSYCL_FOUND False)
   set(SYCL_REASON_FAILURE "SYCL: CMAKE_CXX_COMPILER not set!!")
-  set(IntelDPCPP_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}")
+  set(IntelSYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}")
+endif()
+
+# Check if a Compiler ID is being set. project() should be called prior to find_package()
+
+if("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "x")
+  set(IntelSYCL_FOUND False)
+  set(SYCL_REASON_FAILURE "CMake CXX Compiler family is not set. Please make sure find_package(IntelSYCL) is called after project()!!")
+  set(IntelSYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}")
+  return()
 endif()

 # Check for known compiler family that supports SYCL

 if( NOT "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xClang" AND
     NOT "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xIntelLLVM")
-   set(IntelDPCPP_FOUND False)
+   set(IntelSYCL_FOUND False)
    set(SYCL_REASON_FAILURE "Unsupported compiler family ${CMAKE_CXX_COMPILER_ID} and compiler ${CMAKE_CXX_COMPILER}!!")
-   set(IntelDPCPP_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}")
+   set(IntelSYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}")
    return()
 endif()

 # Assume that CXX Compiler supports SYCL and then test to verify.
 set(SYCL_COMPILER ${CMAKE_CXX_COMPILER})

-
 # Function to write a test case to verify SYCL features.

 function(SYCL_FEATURE_TEST_WRITE src)
@@ -144,7 +156,7 @@ function(SYCL_FEATURE_TEST_BUILD TEST_SRC_FILE TEST_EXE)
     OUTPUT_VARIABLE output ERROR_VARIABLE output
     OUTPUT_FILE ${SYCL_TEST_DIR}/Compile.log
     RESULT_VARIABLE result
-    TIMEOUT 20
+    TIMEOUT 60
     )

   # Verify if test case build properly.
@@ -168,12 +180,12 @@ function(SYCL_FEATURE_TEST_RUN TEST_EXE)
     WORKING_DIRECTORY ${SYCL_TEST_DIR}
     OUTPUT_VARIABLE output ERROR_VARIABLE output
     RESULT_VARIABLE result
-    TIMEOUT 20
+    TIMEOUT 60
     )

   # Verify the test execution output.
if(test_result) - set(IntelDPCPP_FOUND False) + set(IntelSYCL_FOUND False) set(SYCL_REASON_FAILURE "SYCL: feature test execution failed!!") endif() # TODO: what iff the result is false.. error or ignore? @@ -236,14 +248,14 @@ set(SYCL_LINK_FLAGS "") # Based on Compiler ID, add support for SYCL if( "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xClang" OR "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xIntelLLVM") - set(SYCL_FLAGS "-fsycl ") - set(SYCL_LINK_FLAGS "-fsycl ") + list(APPEND SYCL_FLAGS "-fsycl") + list(APPEND SYCL_LINK_FLAGS "-fsycl") endif() # TODO verify if this is needed # Windows: Add Exception handling if(WIN32) - set(SYCL_FLAGS "${SYCL_FLAGS} /EHsc") + list(APPEND SYCL_FLAGS "/EHsc") endif() set(SYCL_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SYCL_FLAGS}") @@ -273,32 +285,76 @@ SYCL_FEATURE_TEST_EXTRACT(${test_output}) # define macro SYCL_LANGUAGE_VERSION string(COMPARE EQUAL "${SYCL_LANGUAGE_VERSION}" "" nosycllang) if(nosycllang) - set(IntelDPCPP_FOUND False) + set(IntelSYCL_FOUND False) set(SYCL_REASON_FAILURE "SYCL: It appears that the ${CMAKE_CXX_COMPILER} does not support SYCL") - set(IntelDPCPP_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") + set(IntelSYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") endif() # Placeholder for identifying various implemenations of SYCL compilers. # for now, set to the CMAKE_CXX_COMPILER_ID set(SYCL_IMPLEMENTATION_ID "${CMAKE_CXX_COMPILER_ID}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SYCL_FLAGS}") -set(CMAKE_CXX_LINK_FLAGS "${CMAKE_CXX_LINK_FLAGS} ${SYCL_LINK_FLAGS}") +message(DEBUG "The SYCL compiler is ${SYCL_COMPILER}") +message(DEBUG "The SYCL Flags are ${SYCL_FLAGS}") +message(DEBUG "The SYCL Language Version is ${SYCL_LANGUAGE_VERSION}") -message(STATUS "Echo from ${CMAKE_CURRENT_SOURCE_DIR}/IntelDPCPPConfig.cmake") -message(STATUS "The SYCL compiler is ${SYCL_COMPILER}") -message(STATUS "The SYCL Flags are ${SYCL_FLAGS}") -message(STATUS "The SYCL Language Version is ${SYCL_LANGUAGE_VERSION}") +add_library(IntelSYCL::SYCL_CXX INTERFACE IMPORTED) +set_property(TARGET IntelSYCL::SYCL_CXX PROPERTY + INTERFACE_COMPILE_OPTIONS ${SYCL_FLAGS}) +set_property(TARGET IntelSYCL::SYCL_CXX PROPERTY + INTERFACE_LINK_OPTIONS ${SYCL_LINK_FLAGS}) +set_property(TARGET IntelSYCL::SYCL_CXX PROPERTY + INTERFACE_INCLUDE_DIRECTORIES ${SYCL_INCLUDE_DIR}) +set_property(TARGET IntelSYCL::SYCL_CXX PROPERTY + INTERFACE_LINK_DIRECTORIES ${SYCL_LIBRARY_DIR}) find_package_handle_standard_args( - IntelDPCPP - FOUND_VAR IntelDPCPP_FOUND + IntelSYCL + FOUND_VAR IntelSYCL_FOUND REQUIRED_VARS SYCL_INCLUDE_DIR SYCL_LIBRARY_DIR SYCL_FLAGS VERSION_VAR SYCL_LANGUAGE_VERSION REASON_FAILURE_MESSAGE "${SYCL_REASON_FAILURE}") # Include in Cache set(SYCL_LANGUAGE_VERSION "${SYCL_LANGUAGE_VERSION}" CACHE STRING "SYCL Language version") -set(SYCL_INCLUDE_DIR "${SYCL_INCLUDE_DIR}" CACHE FILEPATH "SYCL Include directory") -set(SYCL_LIBRARY_DIR "${SYCL_LIBRARY_DIR}" CACHE FILEPATH "SYCL Library Directory") -set(SYCL_FLAGS "${SYCL_FLAGS}" CACHE STRING "SYCL flags for the compiler") + +function(add_sycl_to_target) + + set(one_value_args TARGET) + set(multi_value_args SOURCES) + cmake_parse_arguments(SYCL + "" + "${one_value_args}" + "${multi_value_args}" + ${ARGN}) + + + get_target_property(__sycl_cxx_options IntelSYCL::SYCL_CXX INTERFACE_COMPILE_OPTIONS) + get_target_property(__sycl_cxx_include_directories IntelSYCL::SYCL_CXX INTERFACE_INCLUDE_DIRECTORIES) + + if(NOT ${ARGC}) + message(FATAL_ERROR " add_sycl_to_target() does not have any arguments") + elseif(${ARGC} EQUAL 1) + message(WARNING 
"add_sycl_to_target() have only one argument specified.. assuming the target to be ${ARGV}. +Adding sycl to all sources but that may effect compilation times") + set(SYCL_TARGET ${ARGV}) + endif() + + if(NOT SYCL_SOURCES) + message(WARNING "add_sycl_to_target() does not have sources specified.. Adding sycl to all sources but that may effect compilation times") + target_compile_options(${SYCL_TARGET} PUBLIC ${__sycl_cxx_options}) + target_include_directories(${SYCL_TARGET} PUBLIC ${__sycl_cxx_include_directories}) + endif() + + foreach(source ${SYCL_SOURCES}) + set_source_files_properties(${source} PROPERTIES COMPILE_OPTIONS "${__sycl_cxx_options}") + set_source_files_properties(${source} PROPERTIES INCLUDE_DIRECTORIES "${__sycl_cxx_include_directories}") + endforeach() + + get_target_property(__sycl_link_options + IntelSYCL::SYCL_CXX INTERFACE_LINK_OPTIONS) + target_link_options(${SYCL_TARGET} PRIVATE "${__sycl_link_options}") + get_target_property(__sycl_link_directories + IntelSYCL::SYCL_CXX INTERFACE_LINK_DIRECTORIES) + target_link_directories(${SYCL_TARGET} PUBLIC "${__sycl_link_directories}") +endfunction(add_sycl_to_target) diff --git a/conda-recipe/bld.bat b/conda-recipe/bld.bat index 07fa580bb4..e87254b446 100644 --- a/conda-recipe/bld.bat +++ b/conda-recipe/bld.bat @@ -6,6 +6,11 @@ set "INCLUDE=%BUILD_PREFIX%\include;%INCLUDE%" "%PYTHON%" setup.py clean --all set "SKBUILD_ARGS=-G Ninja -- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_CXX_COMPILER:PATH=icx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" +REM Overriding IPO is useful for building in resources constrained VMs (public CI) +if DEFINED OVERRIDE_INTEL_IPO ( + set "SKBUILD_ARGS=%SKBUILD_ARGS% -DCMAKE_INTERPROCEDURAL_OPTIMIZATION:BOOL=FALSE" +) + FOR %%V IN (14.0.0 14 15.0.0 15 16.0.0 16 17.0.0 17) DO @( REM set DIR_HINT if directory exists IF EXIST "%BUILD_PREFIX%\Library\lib\clang\%%V\" ( @@ -29,7 +34,7 @@ if EXIST "%PLATFORM_DIR%" ( if NOT "%WHEELS_OUTPUT_FOLDER%"=="" ( rem Install and assemble wheel package from the build bits - "%PYTHON%" setup.py install bdist_wheel %SKBUILD_ARGS% + "%PYTHON%" setup.py install bdist_wheel --build-number %GIT_DESCRIBE_NUMBER% %SKBUILD_ARGS% if errorlevel 1 exit 1 copy dist\dpctl*.whl %WHEELS_OUTPUT_FOLDER% if errorlevel 1 exit 1 diff --git a/conda-recipe/build.sh b/conda-recipe/build.sh index 87155d3fb3..f21660ec50 100755 --- a/conda-recipe/build.sh +++ b/conda-recipe/build.sh @@ -17,11 +17,7 @@ echo "${PYTHON} setup.py install ${SKBUILD_ARGS}" if [ -n "${WHEELS_OUTPUT_FOLDER}" ]; then # Install packages and assemble wheel package from built bits - if [ "$CONDA_PY" == "36" ]; then - WHEELS_BUILD_ARGS="-p manylinux1_x86_64" - else - WHEELS_BUILD_ARGS="-p manylinux2014_x86_64" - fi + WHEELS_BUILD_ARGS="-p manylinux2014_x86_64 --build-number ${GIT_DESCRIBE_NUMBER}" ${PYTHON} setup.py install bdist_wheel ${WHEELS_BUILD_ARGS} ${SKBUILD_ARGS} cp dist/dpctl*.whl ${WHEELS_OUTPUT_FOLDER} else diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 7b6f071610..2806fb9262 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -9,6 +9,7 @@ build: number: {{ GIT_DESCRIBE_NUMBER }} script_env: - WHEELS_OUTPUT_FOLDER + - OVERRIDE_INTEL_IPO # [win] requirements: build: diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt index a466d3eef1..cb872ff45f 100644 --- a/dpctl/CMakeLists.txt +++ b/dpctl/CMakeLists.txt @@ -58,7 +58,6 @@ elseif(UNIX) string(CONCAT CXXFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}" - "-fsycl " ) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 ${CFLAGS}") set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -O3 ${CXXFLAGS}") @@ -137,10 +136,15 @@ add_custom_target( set(CMAKE_INSTALL_RPATH "$ORIGIN") function(build_dpctl_ext _trgt _src _dest) + set(options SYCL) + cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "" "" ${ARGN}) add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src) set(_cythonize_trgt "${_trgt}_cythonize_pyx") add_custom_target(${_cythonize_trgt} DEPENDS ${_src}) - python_add_library(${_trgt} MODULE ${_generated_src}) + Python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src}) + if (BUILD_DPCTL_EXT_SYCL) + add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src}) + endif() target_include_directories(${_trgt} PRIVATE ${NumPy_INCLUDE_DIR} ${DPCTL_INCLUDE_DIR}) add_dependencies(${_trgt} _build_time_create_dpctl_include_copy ${_cythonize_trgt}) if (DPCTL_GENERATE_COVERAGE) @@ -185,13 +189,18 @@ function(build_dpctl_ext _trgt _src _dest) install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) endfunction() -file(GLOB _cython_sources *.pyx) +file(GLOB _cython_sources ${CMAKE_CURRENT_SOURCE_DIR}/*.pyx) +list(REMOVE_ITEM _cython_sources ${CMAKE_CURRENT_SOURCE_DIR}/_sycl_queue.pyx) foreach(_cy_file ${_cython_sources}) get_filename_component(_trgt ${_cy_file} NAME_WLE) build_dpctl_ext(${_trgt} ${_cy_file} "dpctl") endforeach() -target_include_directories(_sycl_queue PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +set(_cy_file ${CMAKE_CURRENT_SOURCE_DIR}/_sycl_queue.pyx) +get_filename_component(_trgt ${_cy_file} NAME_WLE) +build_dpctl_ext(${_trgt} ${_cy_file} "dpctl" SYCL) +# _sycl_queue include _host_task_util.hpp +target_include_directories(${_trgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) add_subdirectory(program) add_subdirectory(memory) diff --git a/dpctl/_backend.pxd b/dpctl/_backend.pxd index 3f7ba63a55..57da77eb7d 100644 --- a/dpctl/_backend.pxd +++ b/dpctl/_backend.pxd @@ -403,6 +403,13 @@ cdef extern from "syclinterface/dpctl_sycl_queue_interface.h": void *Dest, const void *Src, size_t Count) + cdef DPCTLSyclEventRef DPCTLQueue_MemcpyWithEvents( + const DPCTLSyclQueueRef Q, + void *Dest, + const void *Src, + size_t Count, + const DPCTLSyclEventRef *depEvents, + size_t depEventsCount) cdef DPCTLSyclEventRef DPCTLQueue_Memset( const DPCTLSyclQueueRef Q, void *Dest, diff --git a/dpctl/_host_task_util.hpp b/dpctl/_host_task_util.hpp index 8db17594fd..cb3828a54f 100644 --- a/dpctl/_host_task_util.hpp +++ b/dpctl/_host_task_util.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -29,30 +29,30 @@
 ///
 //===----------------------------------------------------------------------===//

+#pragma once
 #include "Python.h"
 #include "syclinterface/dpctl_data_types.h"
+#include "syclinterface/dpctl_sycl_type_casters.hpp"
 #include <vector>

-int async_dec_ref(DPCTLSyclQueueRef QRef,
-                  PyObject **obj_array,
-                  size_t obj_array_size,
-                  DPCTLSyclEventRef *ERefs,
-                  size_t nERefs)
+DPCTLSyclEventRef async_dec_ref(DPCTLSyclQueueRef QRef,
+                                PyObject **obj_array,
+                                size_t obj_array_size,
+                                DPCTLSyclEventRef *depERefs,
+                                size_t nDepERefs,
+                                int *status)
 {
+    using dpctl::syclinterface::unwrap;
+    using dpctl::syclinterface::wrap;

-    sycl::queue *q = reinterpret_cast<sycl::queue *>(QRef);
+    sycl::queue *q = unwrap<sycl::queue>(QRef);

-    std::vector<PyObject *> obj_vec;
-    obj_vec.reserve(obj_array_size);
-    for (size_t obj_id = 0; obj_id < obj_array_size; ++obj_id) {
-        obj_vec.push_back(obj_array[obj_id]);
-    }
+    std::vector<PyObject *> obj_vec(obj_array, obj_array + obj_array_size);

     try {
-        q->submit([&](sycl::handler &cgh) {
-            for (size_t ev_id = 0; ev_id < nERefs; ++ev_id) {
-                cgh.depends_on(
-                    *(reinterpret_cast<sycl::event *>(ERefs[ev_id])));
+        sycl::event ht_ev = q->submit([&](sycl::handler &cgh) {
+            for (size_t ev_id = 0; ev_id < nDepERefs; ++ev_id) {
+                cgh.depends_on(*(unwrap<sycl::event>(depERefs[ev_id])));
             }
             cgh.host_task([obj_array_size, obj_vec]() {
                 // if the main thread has not finilized the interpreter yet
@@ -66,9 +66,21 @@ int async_dec_ref(DPCTLSyclQueueRef QRef,
                }
            });
        });
+
+        constexpr int result_ok = 0;
+
+        *status = result_ok;
+        auto e_ptr = new sycl::event(ht_ev);
+        return wrap<sycl::event>(e_ptr);
    } catch (const std::exception &e) {
-        return 1;
+        constexpr int result_std_exception = 1;
+
+        *status = result_std_exception;
+        return nullptr;
    }

-    return 0;
+    constexpr int result_other_abnormal = 2;
+
+    *status = result_other_abnormal;
+    return nullptr;
 }
+29,6 @@ from ._sycl_event cimport SyclEvent from .program._program cimport SyclKernel -cdef void default_async_error_handler(int) except * nogil - cdef public api class _SyclQueue [ object Py_SyclQueueObject, type Py_SyclQueueType ]: @@ -72,6 +70,19 @@ cdef public api class SyclQueue (_SyclQueue) [ cpdef SyclContext get_sycl_context(self) cpdef SyclDevice get_sycl_device(self) cdef DPCTLSyclQueueRef get_queue_ref(self) + cpdef SyclEvent _submit_keep_args_alive( + self, + object args, + list dEvents + ) + cpdef SyclEvent submit_async( + self, + SyclKernel kernel, + list args, + list gS, + list lS=*, + list dEvents=* + ) cpdef SyclEvent submit( self, SyclKernel kernel, @@ -83,6 +94,7 @@ cdef public api class SyclQueue (_SyclQueue) [ cpdef void wait(self) cdef DPCTLSyclQueueRef get_queue_ref(self) cpdef memcpy(self, dest, src, size_t count) + cpdef SyclEvent memcpy_async(self, dest, src, size_t count, list dEvents=*) cpdef prefetch(self, ptr, size_t count=*) cpdef mem_advise(self, ptr, size_t count, int mem) cpdef SyclEvent submit_barrier(self, dependent_events=*) diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 6acf3396e1..a27e6f940f 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -45,6 +45,7 @@ from ._backend cimport ( # noqa: E211 DPCTLQueue_IsInOrder, DPCTLQueue_MemAdvise, DPCTLQueue_Memcpy, + DPCTLQueue_MemcpyWithEvents, DPCTLQueue_Prefetch, DPCTLQueue_SubmitBarrierForEvents, DPCTLQueue_SubmitNDRange, @@ -56,7 +57,6 @@ from ._backend cimport ( # noqa: E211 _arg_data_type, _backend_type, _queue_property_type, - error_handler_callback, ) from .memory._memory cimport _Memory @@ -65,6 +65,7 @@ import ctypes from .enum_types import backend_type from cpython cimport pycapsule +from cpython.buffer cimport PyObject_CheckBuffer from cpython.ref cimport Py_DECREF, Py_INCREF, PyObject from libc.stdlib cimport free, malloc @@ -73,7 +74,7 @@ import logging cdef extern from "_host_task_util.hpp": - int async_dec_ref(DPCTLSyclQueueRef, PyObject **, size_t, DPCTLSyclEventRef *, size_t) nogil + DPCTLSyclEventRef async_dec_ref(DPCTLSyclQueueRef, PyObject **, size_t, DPCTLSyclEventRef *, size_t, int *) nogil __all__ = [ @@ -114,18 +115,6 @@ cdef class SyclQueueCreationError(Exception): pass -cdef class SyclAsynchronousError(Exception): - """ - A SyclAsynchronousError exception is raised when SYCL operation submission - or execution encounters an error. 
- """ - - -cdef void default_async_error_handler(int err) except * nogil: - with gil: - raise SyclAsynchronousError(err) - - cdef int _parse_queue_properties(object prop) except *: cdef int res = 0 cdef object props @@ -173,6 +162,62 @@ cdef void _queue_capsule_deleter(object o) noexcept: DPCTLQueue_Delete(QRef) +cdef bint _is_buffer(object o): + return PyObject_CheckBuffer(o) + + +cdef DPCTLSyclEventRef _memcpy_impl( + SyclQueue q, + object dst, + object src, + size_t byte_count, + DPCTLSyclEventRef *dep_events, + size_t dep_events_count +) except *: + cdef void *c_dst_ptr = NULL + cdef void *c_src_ptr = NULL + cdef DPCTLSyclEventRef ERef = NULL + cdef const unsigned char[::1] src_host_buf = None + cdef unsigned char[::1] dst_host_buf = None + + if isinstance(src, _Memory): + c_src_ptr = (<_Memory>src).memory_ptr + elif _is_buffer(src): + src_host_buf = src + c_src_ptr = &src_host_buf[0] + else: + raise TypeError( + "Parameter `src` should have either type " + "`dpctl.memory._Memory` or a type that " + "supports Python buffer protocol" + ) + + if isinstance(dst, _Memory): + c_dst_ptr = (<_Memory>dst).memory_ptr + elif _is_buffer(dst): + dst_host_buf = dst + c_dst_ptr = &dst_host_buf[0] + else: + raise TypeError( + "Parameter `dst` should have either type " + "`dpctl.memory._Memory` or a type that " + "supports Python buffer protocol" + ) + + if dep_events_count == 0 or dep_events is NULL: + ERef = DPCTLQueue_Memcpy(q._queue_ref, c_dst_ptr, c_src_ptr, byte_count) + else: + ERef = DPCTLQueue_MemcpyWithEvents( + q._queue_ref, + c_dst_ptr, + c_src_ptr, + byte_count, + dep_events, + dep_events_count + ) + return ERef + + cdef class _SyclQueue: """ Barebone data owner class used by SyclQueue. """ @@ -404,7 +449,7 @@ cdef class SyclQueue(_SyclQueue): QRef = DPCTLQueue_Create( CRef, DRef, - &default_async_error_handler, + NULL, props ) if QRef is NULL: @@ -481,7 +526,7 @@ cdef class SyclQueue(_SyclQueue): QRef = DPCTLQueue_Create( CRef, DRef, - &default_async_error_handler, + NULL, props ) if (QRef is NULL): @@ -566,7 +611,7 @@ cdef class SyclQueue(_SyclQueue): qref = DPCTLQueue_Create( cref, dref, - &default_async_error_handler, + NULL, props ) if qref is NULL: @@ -716,7 +761,81 @@ cdef class SyclQueue(_SyclQueue): """ return self._queue_ref - cpdef SyclEvent submit( + + cpdef SyclEvent _submit_keep_args_alive( + self, + object args, + list dEvents + ): + """ SyclQueue._submit_keep_args_alive(args, events) + + Keeps objects in `args` alive until tasks associated with events + complete. + + Args: + args(object): Python object to keep alive. + Typically a tuple with arguments to offloaded tasks + events(Tuple[dpctl.SyclEvent]): Gating events + The list or tuple of events associated with tasks + working on Python objects collected in `args`. + Returns: + dpctl.SyclEvent + The event associated with the submission of host task. + + Increments reference count of `args` and schedules asynchronous + ``host_task`` to decrement the count once dependent events are + complete. + + N.B.: The `host_task` attempts to acquire Python GIL, and it is + known to be unsafe during interpreter shudown sequence. It is + thus strongly advised to ensure that all submitted `host_task` + complete before the end of the Python script. 
+ """ + cdef size_t nDE = len(dEvents) + cdef DPCTLSyclEventRef *depEvents = NULL + cdef PyObject *args_raw = NULL + cdef DPCTLSyclEventRef htERef = NULL + cdef int status = -1 + + # Create the array of dependent events if any + if nDE > 0: + depEvents = ( + malloc(nDE*sizeof(DPCTLSyclEventRef)) + ) + if not depEvents: + raise MemoryError() + else: + for idx, de in enumerate(dEvents): + if isinstance(de, SyclEvent): + depEvents[idx] = (de).get_event_ref() + else: + free(depEvents) + raise TypeError( + "A sequence of dpctl.SyclEvent is expected" + ) + + # increment reference counts to list of arguments + Py_INCREF(args) + + # schedule decrement + args_raw = args + + htERef = async_dec_ref( + self.get_queue_ref(), + &args_raw, 1, + depEvents, nDE, &status + ) + + free(depEvents) + if (status != 0): + with nogil: DPCTLEvent_Wait(htERef) + DPCTLEvent_Delete(htERef) + raise RuntimeError("Could not submit keep_args_alive host_task") + + return SyclEvent._create(htERef) + + + cpdef SyclEvent submit_async( self, SyclKernel kernel, list args, @@ -728,13 +847,14 @@ cdef class SyclQueue(_SyclQueue): cdef _arg_data_type *kargty = NULL cdef DPCTLSyclEventRef *depEvents = NULL cdef DPCTLSyclEventRef Eref = NULL + cdef DPCTLSyclEventRef htEref = NULL cdef int ret = 0 cdef size_t gRange[3] cdef size_t lRange[3] cdef size_t nGS = len(gS) cdef size_t nLS = len(lS) if lS is not None else 0 cdef size_t nDE = len(dEvents) if dEvents is not None else 0 - cdef PyObject **arg_objects = NULL + cdef PyObject *args_raw = NULL cdef ssize_t i = 0 # Allocate the arrays to be sent to DPCTLQueue_Submit @@ -758,7 +878,15 @@ cdef class SyclQueue(_SyclQueue): raise MemoryError() else: for idx, de in enumerate(dEvents): - depEvents[idx] = (de).get_event_ref() + if isinstance(de, SyclEvent): + depEvents[idx] = (de).get_event_ref() + else: + free(kargs) + free(kargty) + free(depEvents) + raise TypeError( + "A sequence of dpctl.SyclEvent is expected" + ) # populate the args and argstype arrays ret = self._populate_args(args, kargs, kargty) @@ -836,50 +964,69 @@ cdef class SyclQueue(_SyclQueue): raise SyclKernelSubmitError( "Kernel submission to Sycl queue failed." 
            )
-            # increment reference counts to each argument
-            arg_objects = <PyObject **>malloc(len(args) * sizeof(PyObject *))
-            for i in range(len(args)):
-                arg_objects[i] = <PyObject *>(args[i])
-                Py_INCREF(<object> arg_objects[i])
-
-            # schedule decrement
-            if async_dec_ref(self.get_queue_ref(), arg_objects, len(args), &Eref, 1):
-                # async task submission failed, decrement ref counts and wait
-                for i in range(len(args)):
-                    arg_objects[i] = <PyObject *>(args[i])
-                    Py_DECREF(<object> arg_objects[i])
-                with nogil: DPCTLEvent_Wait(Eref)
-
-            # free memory
-            free(arg_objects)
        return SyclEvent._create(Eref)

+    cpdef SyclEvent submit(
+        self,
+        SyclKernel kernel,
+        list args,
+        list gS,
+        list lS=None,
+        list dEvents=None
+    ):
+        cdef SyclEvent e = self.submit_async(kernel, args, gS, lS, dEvents)
+        e.wait()
+        return e
+
    cpdef void wait(self):
        with nogil: DPCTLQueue_Wait(self._queue_ref)

    cpdef memcpy(self, dest, src, size_t count):
-        cdef void *c_dest
-        cdef void *c_src
+        """Copy memory from `src` to `dest`, blocking until the copy completes"""
        cdef DPCTLSyclEventRef ERef = NULL

-        if isinstance(dest, _Memory):
-            c_dest = <void*>(<_Memory>dest).memory_ptr
-        else:
-            raise TypeError("Parameter `dest` should have type _Memory.")
+        ERef = _memcpy_impl(self, dest, src, count, NULL, 0)
+        if (ERef is NULL):
+            raise RuntimeError(
+                "SyclQueue.memcpy operation encountered an error"
+            )
+        with nogil: DPCTLEvent_Wait(ERef)
+        DPCTLEvent_Delete(ERef)
+
+    cpdef SyclEvent memcpy_async(self, dest, src, size_t count, list dEvents=None):
+        """Copy memory from `src` to `dest` asynchronously, returning an event"""
+        cdef DPCTLSyclEventRef ERef = NULL
+        cdef DPCTLSyclEventRef *depEvents = NULL
+        cdef size_t nDE = 0

-        if isinstance(src, _Memory):
-            c_src = <void*>(<_Memory>src).memory_ptr
+        if dEvents is None:
+            ERef = _memcpy_impl(self, dest, src, count, NULL, 0)
        else:
-            raise TypeError("Parameter `src` should have type _Memory.")
+            nDE = len(dEvents)
+            depEvents = <DPCTLSyclEventRef*>(
+                malloc(nDE*sizeof(DPCTLSyclEventRef))
+            )
+            if depEvents is NULL:
+                raise MemoryError()
+            else:
+                for idx, de in enumerate(dEvents):
+                    if isinstance(de, SyclEvent):
+                        depEvents[idx] = (<SyclEvent>de).get_event_ref()
+                    else:
+                        free(depEvents)
+                        raise TypeError(
+                            "A sequence of dpctl.SyclEvent is expected"
+                        )
+            ERef = _memcpy_impl(self, dest, src, count, depEvents, nDE)
+            free(depEvents)

-        ERef = DPCTLQueue_Memcpy(self._queue_ref, c_dest, c_src, count)
        if (ERef is NULL):
            raise RuntimeError(
                "SyclQueue.memcpy operation encountered an error"
            )
-        with nogil: DPCTLEvent_Wait(ERef)
-        DPCTLEvent_Delete(ERef)
+
+        return SyclEvent._create(ERef)

    cpdef prefetch(self, mem, size_t count=0):
        cdef void *ptr
diff --git a/dpctl/_sycl_timer.py b/dpctl/_sycl_timer.py
index 322272df2d..66dd4f9340 100644
--- a/dpctl/_sycl_timer.py
+++ b/dpctl/_sycl_timer.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
 import timeit

 from . import SyclQueue
@@ -22,6 +21,29 @@
 __doc__ = "This module implements :class:`dpctl.SyclTimer`."
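+
+
+# Value type returned by SyclTimer.dt: exposes host_dt and device_dt as
+# properties and iterates as the pair (host_dt, device_dt), so existing
+# tuple unpacking of the timer result keeps working.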
+class HostDeviceDuration: + def __init__(self, host_dt, device_dt): + self._host_dt = host_dt + self._device_dt = device_dt + + def __repr__(self): + return f"(host_dt={self._host_dt}, device_dt={self._device_dt})" + + def __str__(self): + return f"(host_dt={self._host_dt}, device_dt={self._device_dt})" + + def __iter__(self): + yield from [self._host_dt, self._device_dt] + + @property + def host_dt(self): + return self._host_dt + + @property + def device_dt(self): + return self._device_dt + + class SyclTimer: """ SyclTimer(host_timer=timeit.default_timer, time_scale=1) @@ -45,7 +67,7 @@ class SyclTimer: code_block # retrieve elapsed times in milliseconds - sycl_dt, wall_dt = timer.dt + wall_dt, device_dt = timer.dt Remark: The timer submits barriers to the queue at the entrance and the @@ -67,10 +89,8 @@ def __init__(self, host_timer=timeit.default_timer, time_scale=1): self.timer = host_timer self.time_scale = time_scale self.queue = None - self.host_start = None - self.host_finish = None - self.event_start = None - self.event_finish = None + self.host_times = [] + self.bracketing_events = [] def __call__(self, queue=None): if isinstance(queue, SyclQueue): @@ -89,27 +109,31 @@ def __call__(self, queue=None): return self def __enter__(self): - self.event_start = self.queue.submit_barrier() - self.host_start = self.timer() + self._event_start = self.queue.submit_barrier() + self._host_start = self.timer() return self def __exit__(self, *args): - self.event_finish = self.queue.submit_barrier() - self.host_finish = self.timer() + self.host_times.append((self._host_start, self.timer())) + self.bracketing_events.append( + (self._event_start, self.queue.submit_barrier()) + ) + del self._event_start + del self._host_start @property def dt(self): - """Returns a tuple of elapsed times where first - element is the duration as measured by the host timer, - while the second element is the duration as measured by - the device timer and encoded in profiling events""" - self.event_start.wait() - self.event_finish.wait() - return ( - (self.host_finish - self.host_start) * self.time_scale, - ( - self.event_finish.profiling_info_start - - self.event_start.profiling_info_end - ) - * (1e-9 * self.time_scale), - ) + """Returns a pair of elapsed times (host_dt, device_dt). 
+ + The host_dt is the duration as measured by the host + timer, while the device_dt is the duration as measured by + the device timer and encoded in profiling events.""" + for es, ef in self.bracketing_events: + es.wait() + ef.wait() + host_dt = sum(tf - ts for ts, tf in self.host_times) * self.time_scale + dev_dt = sum( + ef.profiling_info_start - es.profiling_info_end + for es, ef in self.bracketing_events + ) * (1e-9 * self.time_scale) + return HostDeviceDuration(host_dt, dev_dt) diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 456eebdbaa..d1de208805 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -30,11 +30,96 @@ if(WIN32) endif() endif() -set(python_module_name _tensor_impl) -pybind11_add_module(${python_module_name} MODULE - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp +set(_elementwise_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/abs.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acos.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acosh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/add.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asinh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atanh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_and.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_invert.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_left_shift.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_or.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_right_shift.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_xor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cbrt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/ceil.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/conj.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/copysign.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cos.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cosh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/expm1.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor_divide.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater_equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/hypot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/imag.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isfinite.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isinf.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isnan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less_equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log1p.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log10.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logaddexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_and.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_not.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_or.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_xor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/maximum.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/minimum.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/multiply.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/negative.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/not_equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/positive.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/pow.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/proj.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/real.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/remainder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/round.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/rsqrt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sign.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/signbit.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sinh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/square.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/subtract.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tanh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/true_divide.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/trunc.cpp +) +set(_reduction_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduction_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp +) +set(_boolean_reduction_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp +) +set(_tensor_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp @@ -46,39 +131,84 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sum_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) +set(_tensor_elementwise_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_elementwise.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${_elementwise_sources} +) +set(_tensor_reductions_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_reductions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${_boolean_reduction_sources} + ${_reduction_sources} +) + +set(_py_trgts) + +set(python_module_name _tensor_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_elementwise_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_elementwise_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_elementwise_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_reductions_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + set(_clang_prefix "") if (WIN32) set(_clang_prefix "/clang:") endif() -set_source_files_properties( + +set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp - PROPERTIES COMPILE_OPTIONS "${_clang_prefix}-fno-fast-math") + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp +) +list(APPEND _no_fast_math_sources + ${_elementwise_sources} + ${_reduction_sources} +) + +foreach(_src_fn ${_no_fast_math_sources}) + get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) + set(_combined_options_prop ${_cmpl_options_prop} "${_clang_prefix}-fno-fast-math") + set_source_files_properties( + ${_src_fn} + PROPERTIES 
COMPILE_OPTIONS "${_combined_options_prop}" + ) +endforeach() if (UNIX) set_source_files_properties( - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/abs.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp PROPERTIES COMPILE_DEFINITIONS "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES") endif() -target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int) -target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel) -if(UNIX) - # this option is supported on Linux only - target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code) -endif() -target_include_directories(${python_module_name} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/../include - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ -) + set(_linker_options "LINKER:${DPCTL_LDFLAGS}") -target_link_options(${python_module_name} PRIVATE ${_linker_options}) -add_dependencies(${python_module_name} _dpctl4pybind11_deps) -install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor") +foreach(python_module_name ${_py_trgts}) + target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int) + target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel) + if(UNIX) + # this option is supported on Linux only + target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code) + endif() + target_include_directories(${python_module_name} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ + ) + target_link_options(${python_module_name} PRIVATE ${_linker_options}) + add_dependencies(${python_module_name} _dpctl4pybind11_deps) + install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor") +endforeach() diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index f0930004ec..4355fea442 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -90,9 +90,12 @@ ) from dpctl.tensor._reshape import reshape from dpctl.tensor._search_functions import where +from dpctl.tensor._statistical_functions import mean, std, var from dpctl.tensor._usmarray import usm_ndarray from dpctl.tensor._utility_functions import all, any +from ._array_api import __array_api_version__, __array_namespace_info__ +from ._clip import clip from ._constants import e, inf, nan, newaxis, pi from ._elementwise_funcs import ( abs, @@ -110,13 +113,16 @@ bitwise_or, bitwise_right_shift, bitwise_xor, + cbrt, ceil, conj, + copysign, cos, cosh, divide, equal, exp, + exp2, expm1, floor, floor_divide, @@ -149,6 +155,7 @@ real, remainder, round, + rsqrt, sign, signbit, sin, @@ -160,7 +167,16 @@ tanh, trunc, ) -from ._reduction import sum +from ._reduction import ( + argmax, + argmin, + logsumexp, + max, + min, + prod, + reduce_hypot, + sum, +) from ._testing import allclose __all__ = [ @@ -309,4 +325,21 @@ "allclose", "repeat", "tile", + "max", + "min", + "argmax", + "argmin", + "prod", + "cbrt", + "exp2", + "copysign", + "rsqrt", + "clip", + "logsumexp", + "reduce_hypot", + "mean", + "std", + "var", + "__array_api_version__", + "__array_namespace_info__", ] diff --git a/dpctl/tensor/_array_api.py b/dpctl/tensor/_array_api.py new file mode 100644 index 0000000000..613d6dcd66 --- /dev/null +++ b/dpctl/tensor/_array_api.py @@ -0,0 +1,207 @@ +# Data Parallel Control 
(dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import dpctl
+import dpctl.tensor as dpt
+from dpctl.tensor._tensor_impl import (
+    default_device_complex_type,
+    default_device_fp_type,
+    default_device_index_type,
+    default_device_int_type,
+)
+
+
+def _isdtype_impl(dtype, kind):
+    if isinstance(kind, str):
+        if kind == "bool":
+            return dtype.kind == "b"
+        elif kind == "signed integer":
+            return dtype.kind == "i"
+        elif kind == "unsigned integer":
+            return dtype.kind == "u"
+        elif kind == "integral":
+            return dtype.kind in "iu"
+        elif kind == "real floating":
+            return dtype.kind == "f"
+        elif kind == "complex floating":
+            return dtype.kind == "c"
+        elif kind == "numeric":
+            return dtype.kind in "iufc"
+        else:
+            raise ValueError(f"Unrecognized data type kind: {kind}")
+
+    elif isinstance(kind, tuple):
+        return any(_isdtype_impl(dtype, k) for k in kind)
+    else:
+        raise TypeError(f"Unsupported data type kind: {kind}")
+
+
+__array_api_version__ = "2022.12"
+
+
+class Info:
+    """
+    namespace returned by `__array_namespace_info__()`
+    """
+
+    def __init__(self):
+        self._capabilities = {
+            "boolean_indexing": True,
+            "data_dependent_shapes": True,
+        }
+        self._all_dtypes = {
+            "bool": dpt.bool,
+            "float32": dpt.float32,
+            "float64": dpt.float64,
+            "complex64": dpt.complex64,
+            "complex128": dpt.complex128,
+            "int8": dpt.int8,
+            "int16": dpt.int16,
+            "int32": dpt.int32,
+            "int64": dpt.int64,
+            "uint8": dpt.uint8,
+            "uint16": dpt.uint16,
+            "uint32": dpt.uint32,
+            "uint64": dpt.uint64,
+        }
+
+    def capabilities(self):
+        """
+        Returns a dictionary of `dpctl`'s capabilities.
+
+        Returns:
+            dict:
+                dictionary of `dpctl`'s capabilities
+                - `boolean_indexing`: bool
+                - `data_dependent_shapes`: bool
+        """
+        return self._capabilities.copy()
+
+    def default_device(self):
+        """
+        Returns the default SYCL device.
+        """
+        return dpctl.select_default_device()
+
+    def default_dtypes(self, device=None):
+        """
+        Returns a dictionary of default data types for `device`.
+
+        Args:
+            device (Optional[dpctl.SyclDevice, dpctl.SyclQueue,
+                dpctl.tensor.Device]):
+                array API concept of device used in getting default data types.
+                `device` can be `None` (in which case the default device is
+                used), an instance of :class:`dpctl.SyclDevice` corresponding
+                to a non-partitioned SYCL device, an instance of
+                :class:`dpctl.SyclQueue`, or a `Device` object returned by
+                :attr:`dpctl.tensor.usm_ndarray.device`. Default: `None`.
+
+        Returns:
+            dict:
+                a dictionary of default data types for `device`
+                - `real floating`: dtype
+                - `complex floating`: dtype
+                - `integral`: dtype
+                - `indexing`: dtype
+        """
+        if device is None:
+            device = dpctl.select_default_device()
+        elif isinstance(device, dpt.Device):
+            device = device.sycl_device
+        return {
+            "real floating": dpt.dtype(default_device_fp_type(device)),
+            "complex floating": dpt.dtype(default_device_complex_type(device)),
+            "integral": dpt.dtype(default_device_int_type(device)),
+            "indexing": dpt.dtype(default_device_index_type(device)),
+        }
+
+    def dtypes(self, device=None, kind=None):
+        """
+        Returns a dictionary of all Array API data types of a specified `kind`
+        supported by `device`.
+
+        This dictionary only includes data types supported by the array API
+        specification: https://data-apis.org/array-api/latest/
+
+        Args:
+            device (Optional[dpctl.SyclDevice, dpctl.SyclQueue,
+                dpctl.tensor.Device, str]):
+                array API concept of device used in getting default data types.
+                `device` can be `None` (in which case the default device is
+                used), an instance of :class:`dpctl.SyclDevice` corresponding
+                to a non-partitioned SYCL device, an instance of
+                :class:`dpctl.SyclQueue`, or a `Device` object returned by
+                :attr:`dpctl.tensor.usm_ndarray.device`. Default: `None`.
+
+            kind (Optional[str, Tuple[str, ...]]):
+                data type kind.
+                - if `kind` is `None`, returns a dictionary of all data types
+                  supported by `device`
+                - if `kind` is a string, returns a dictionary containing the
+                  data types belonging to the data type kind specified.
+                  Supports:
+                  - "bool"
+                  - "signed integer"
+                  - "unsigned integer"
+                  - "integral"
+                  - "real floating"
+                  - "complex floating"
+                  - "numeric"
+                - if `kind` is a tuple, the tuple represents a union of `kind`
+                  strings, and returns a dictionary containing data types
+                  corresponding to the specified union.
+                Default: `None`.
+
+        Returns:
+            dict:
+                a dictionary of the supported data types of the specified `kind`
+        """
+        if device is None:
+            device = dpctl.select_default_device()
+        elif isinstance(device, dpt.Device):
+            device = device.sycl_device
+        _fp64 = device.has_aspect_fp64
+        if kind is None:
+            return {
+                key: val
+                for key, val in self._all_dtypes.items()
+                if (key != "float64" or _fp64)
+            }
+        else:
+            return {
+                key: val
+                for key, val in self._all_dtypes.items()
+                if (key != "float64" or _fp64) and _isdtype_impl(val, kind)
+            }
+
+    def devices(self):
+        """
+        Returns a list of supported devices.
+        """
+        return dpctl.get_devices()
+
+
+def __array_namespace_info__():
+    """__array_namespace_info__()
+
+    Returns a namespace with Array API namespace inspection utilities.
+
+    """
+    return Info()
diff --git a/dpctl/tensor/_clip.py b/dpctl/tensor/_clip.py
new file mode 100644
index 0000000000..eeed87b404
--- /dev/null
+++ b/dpctl/tensor/_clip.py
@@ -0,0 +1,838 @@
+# Data Parallel Control (dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
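+# Implements dpctl.tensor.clip. The helper below resolves mixed strong
+# (array) and weak (Python scalar) data types following NEP 50 rules
+# before dispatching to the underlying kernels.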
+
+import dpctl
+import dpctl.tensor as dpt
+import dpctl.tensor._tensor_elementwise_impl as tei
+import dpctl.tensor._tensor_impl as ti
+from dpctl.tensor._copy_utils import (
+    _empty_like_orderK,
+    _empty_like_pair_orderK,
+    _empty_like_triple_orderK,
+)
+from dpctl.tensor._elementwise_common import (
+    WeakBooleanType,
+    WeakComplexType,
+    WeakFloatingType,
+    WeakIntegralType,
+    _get_dtype,
+    _get_queue_usm_type,
+    _get_shape,
+    _strong_dtype_num_kind,
+    _validate_dtype,
+    _weak_type_num_kind,
+)
+from dpctl.tensor._manipulation_functions import _broadcast_shape_impl
+from dpctl.tensor._type_utils import _can_cast, _to_device_supported_dtype
+from dpctl.utils import ExecutionPlacementError
+
+
+def _resolve_one_strong_two_weak_types(st_dtype, dtype1, dtype2, dev):
+    """Resolves weak data types per NEP-0050, where the second and
+    third arguments are permitted to be weak types."""
+    if isinstance(
+        st_dtype,
+        (
+            WeakBooleanType,
+            WeakIntegralType,
+            WeakFloatingType,
+            WeakComplexType,
+        ),
+    ):
+        raise ValueError("first argument must be a strong data type")
+    if isinstance(
+        dtype1,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        if isinstance(
+            dtype2,
+            (
+                WeakBooleanType,
+                WeakIntegralType,
+                WeakFloatingType,
+                WeakComplexType,
+            ),
+        ):
+            kind_num1 = _weak_type_num_kind(dtype1)
+            kind_num2 = _weak_type_num_kind(dtype2)
+            st_kind_num = _strong_dtype_num_kind(st_dtype)
+
+            if kind_num1 > st_kind_num:
+                if isinstance(dtype1, WeakIntegralType):
+                    ret_dtype1 = dpt.dtype(ti.default_device_int_type(dev))
+                elif isinstance(dtype1, WeakComplexType):
+                    if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                        ret_dtype1 = dpt.complex64
+                    else:
+                        ret_dtype1 = _to_device_supported_dtype(
+                            dpt.complex128, dev
+                        )
+                else:
+                    ret_dtype1 = _to_device_supported_dtype(dpt.float64, dev)
+            else:
+                ret_dtype1 = st_dtype
+
+            if kind_num2 > st_kind_num:
+                if isinstance(dtype2, WeakIntegralType):
+                    ret_dtype2 = dpt.dtype(ti.default_device_int_type(dev))
+                elif isinstance(dtype2, WeakComplexType):
+                    if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                        ret_dtype2 = dpt.complex64
+                    else:
+                        ret_dtype2 = _to_device_supported_dtype(
+                            dpt.complex128, dev
+                        )
+                else:
+                    ret_dtype2 = _to_device_supported_dtype(dpt.float64, dev)
+            else:
+                ret_dtype2 = st_dtype
+
+            return ret_dtype1, ret_dtype2
+
+        max_dt_num_kind, max_dtype = max(
+            [
+                (_strong_dtype_num_kind(st_dtype), st_dtype),
+                (_strong_dtype_num_kind(dtype2), dtype2),
+            ]
+        )
+        dt1_kind_num = _weak_type_num_kind(dtype1)
+        if dt1_kind_num > max_dt_num_kind:
+            if isinstance(dtype1, WeakIntegralType):
+                return dpt.dtype(ti.default_device_int_type(dev)), dtype2
+            if isinstance(dtype1, WeakComplexType):
+                if max_dtype is dpt.float16 or max_dtype is dpt.float32:
+                    return dpt.complex64, dtype2
+                return (
+                    _to_device_supported_dtype(dpt.complex128, dev),
+                    dtype2,
+                )
+            return _to_device_supported_dtype(dpt.float64, dev), dtype2
+        else:
+            return max_dtype, dtype2
+    elif isinstance(
+        dtype2,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        max_dt_num_kind, max_dtype = max(
+            [
+                (_strong_dtype_num_kind(st_dtype), st_dtype),
+                (_strong_dtype_num_kind(dtype1), dtype1),
+            ]
+        )
+        dt2_kind_num = _weak_type_num_kind(dtype2)
+        if dt2_kind_num > max_dt_num_kind:
+            if isinstance(dtype2, WeakIntegralType):
+                return dtype1, dpt.dtype(ti.default_device_int_type(dev))
+            if isinstance(dtype2, WeakComplexType):
+                if max_dtype is dpt.float16 or max_dtype is dpt.float32:
+                    return dtype1, dpt.complex64
+                return (
+                    dtype1,
+                    _to_device_supported_dtype(dpt.complex128, dev),
+                )
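The resolver above encodes the NEP 50 rule that a weak (Python-scalar) operand only changes the result kind when its kind outranks the strong operand's kind. A minimal standalone sketch of that ordering; the kind numbers and helper name here are illustrative, not dpctl's internal values:

    # Illustrative NEP 50 kind ordering: bool < integral < floating < complex.
    _KIND_ORDER = {"b": 0, "i": 1, "u": 1, "f": 2, "c": 3}


    def weak_operand_is_absorbed(strong_kind, weak_kind):
        """True when the weak operand simply adopts the strong dtype."""
        return _KIND_ORDER[weak_kind] <= _KIND_ORDER[strong_kind]


    assert weak_operand_is_absorbed("f", "i")  # int scalar with float array
    assert not weak_operand_is_absorbed("i", "f")  # float scalar upgrades int array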
+            return dtype1, _to_device_supported_dtype(dpt.float64, dev)
+        else:
+            return dtype1, max_dtype
+    else:
+        # both are strong dtypes
+        # return unmodified
+        return dtype1, dtype2
+
+
+def _resolve_one_strong_one_weak_types(st_dtype, dtype, dev):
+    "Resolves one weak data type with one strong data type per NEP-0050"
+    if isinstance(
+        st_dtype,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        raise ValueError("first argument must be a strong data type")
+    if isinstance(
+        dtype,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        st_kind_num = _strong_dtype_num_kind(st_dtype)
+        kind_num = _weak_type_num_kind(dtype)
+        if kind_num > st_kind_num:
+            if isinstance(dtype, WeakIntegralType):
+                return dpt.dtype(ti.default_device_int_type(dev))
+            if isinstance(dtype, WeakComplexType):
+                if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                    return dpt.complex64
+                return _to_device_supported_dtype(dpt.complex128, dev)
+            return _to_device_supported_dtype(dpt.float64, dev)
+        else:
+            return st_dtype
+    else:
+        return dtype
+
+
+def _check_clip_dtypes(res_dtype, arg1_dtype, arg2_dtype, sycl_dev):
+    """Checks if both types `arg1_dtype` and `arg2_dtype` can be
+    cast to `res_dtype` according to the casting rule `safe`."""
+    if arg1_dtype == res_dtype and arg2_dtype == res_dtype:
+        return None, None, res_dtype
+
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    if _can_cast(arg1_dtype, res_dtype, _fp16, _fp64) and _can_cast(
+        arg2_dtype, res_dtype, _fp16, _fp64
+    ):
+        # prevent unnecessary casting
+        ret_buf1_dt = None if res_dtype == arg1_dtype else res_dtype
+        ret_buf2_dt = None if res_dtype == arg2_dtype else res_dtype
+        return ret_buf1_dt, ret_buf2_dt, res_dtype
+    else:
+        return None, None, None
+
+
+def _clip_none(x, val, out, order, _binary_fn):
+    if order not in ["K", "C", "F", "A"]:
+        order = "K"
+    q1, x_usm_type = x.sycl_queue, x.usm_type
+    q2, val_usm_type = _get_queue_usm_type(val)
+    if q2 is None:
+        exec_q = q1
+        res_usm_type = x_usm_type
+    else:
+        exec_q = dpctl.utils.get_execution_queue((q1, q2))
+        if exec_q is None:
+            raise ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        res_usm_type = dpctl.utils.get_coerced_usm_type(
+            (
+                x_usm_type,
+                val_usm_type,
+            )
+        )
+    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+    x_shape = x.shape
+    val_shape = _get_shape(val)
+    if not isinstance(val_shape, (tuple, list)):
+        raise TypeError(
+            "Shape of arguments can not be inferred. "
+            "Arguments are expected to be "
+            "lists, tuples, or both"
+        )
+    try:
+        res_shape = _broadcast_shape_impl(
+            [
+                x_shape,
+                val_shape,
+            ]
+        )
+    except ValueError:
+        raise ValueError(
+            "operands could not be broadcast together with shapes "
+            f"{x_shape} and {val_shape}"
+        )
+    sycl_dev = exec_q.sycl_device
+    x_dtype = x.dtype
+    val_dtype = _get_dtype(val, sycl_dev)
+    if not _validate_dtype(val_dtype):
+        raise ValueError("Operands have unsupported data types")
+
+    val_dtype = _resolve_one_strong_one_weak_types(x_dtype, val_dtype, sycl_dev)
+
+    res_dt = x.dtype
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    if not _can_cast(val_dtype, res_dt, _fp16, _fp64):
+        raise ValueError(
+            "function 'clip' does not support input types "
+            f"({x_dtype}, {val_dtype}), "
+            "and the inputs could not be safely coerced to any "
+            "supported types according to the casting rule ''safe''."
+ ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {res_shape}, got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + if ( + ti._array_overlap(val, out) + and not ti._same_logical_tensors(val, out) + and val_dtype == res_dt + ): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + val_ary = val + else: + val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q) + + if val_dtype == res_dt: + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, val_ary, res_dt, res_shape, res_usm_type, exec_q + ) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + val_ary, + ) + ) + else "C" + ) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if val_ary.shape != res_shape: + val_ary = dpt.broadcast_to(val_ary, res_shape) + ht_binary_ev, binary_ev = _binary_fn( + src1=x, src2=val_ary, dst=out, sycl_queue=exec_q + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_binary_ev.wait() + return out + else: + if order == "K": + buf = _empty_like_orderK(val_ary, res_dt) + else: + if order == "A": + order = "F" if x.flags.f_contiguous else "C" + buf = dpt.empty_like(val_ary, dtype=res_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=val_ary, dst=buf, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, buf, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + buf = dpt.broadcast_to(buf, res_shape) + ht_binary_ev, binary_ev = _binary_fn( + src1=x, + src2=buf, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_copy_ev.wait() + ht_binary_ev.wait() + return out + + +# need to handle logic for min or max being None +def clip(x, min=None, max=None, out=None, order="K"): + """clip(x, min, max, out=None, order="K") + + Clips to the range [`min_i`, `max_i`] for each element `x_i` + in `x`. + + Args: + x (usm_ndarray): Array containing elements to clip. + Must be compatible with `min` and `max` according + to broadcasting rules. 
+ min ({None, usm_ndarray}, optional): Array containing minimum values. + Must be compatible with `x` and `max` according + to broadcasting rules. + Only one of `min` and `max` can be `None`. + max ({None, usm_ndarray}, optional): Array containing maximum values. + Must be compatible with `x` and `min` according + to broadcasting rules. + Only one of `min` and `max` can be `None`. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is + `None`. + Default: "K". + + Returns: + usm_ndarray: + An array with elements clipped to the range [`min`, `max`]. + The returned array has the same data type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected `x` to be of dpctl.tensor.usm_ndarray type, got " + f"{type(x)}" + ) + if min is None and max is None: + raise ValueError( + "only one of `min` and `max` is permitted to be `None`" + ) + elif max is None: + return _clip_none(x, min, out, order, tei._maximum) + elif min is None: + return _clip_none(x, max, out, order, tei._minimum) + else: + q1, x_usm_type = x.sycl_queue, x.usm_type + q2, min_usm_type = _get_queue_usm_type(min) + q3, max_usm_type = _get_queue_usm_type(max) + if q2 is None and q3 is None: + exec_q = q1 + res_usm_type = x_usm_type + elif q3 is None: + exec_q = dpctl.utils.get_execution_queue((q1, q2)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + x_usm_type, + min_usm_type, + ) + ) + elif q2 is None: + exec_q = dpctl.utils.get_execution_queue((q1, q3)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + x_usm_type, + max_usm_type, + ) + ) + else: + exec_q = dpctl.utils.get_execution_queue((q1, q2, q3)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + x_usm_type, + min_usm_type, + max_usm_type, + ) + ) + dpctl.utils.validate_usm_type(res_usm_type, allow_none=False) + x_shape = x.shape + min_shape = _get_shape(min) + max_shape = _get_shape(max) + if not all( + isinstance(s, (tuple, list)) + for s in ( + min_shape, + max_shape, + ) + ): + raise TypeError( + "Shape of arguments can not be inferred. 
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + x_shape, + min_shape, + max_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{x_shape}, {min_shape}, and {max_shape}" + ) + sycl_dev = exec_q.sycl_device + x_dtype = x.dtype + min_dtype = _get_dtype(min, sycl_dev) + max_dtype = _get_dtype(max, sycl_dev) + if not all(_validate_dtype(o) for o in (min_dtype, max_dtype)): + raise ValueError("Operands have unsupported data types") + + min_dtype, max_dtype = _resolve_one_strong_two_weak_types( + x_dtype, min_dtype, max_dtype, sycl_dev + ) + + buf1_dt, buf2_dt, res_dt = _check_clip_dtypes( + x_dtype, + min_dtype, + max_dtype, + sycl_dev, + ) + + if res_dt is None: + raise ValueError( + f"function '{clip}' does not support input types " + f"({x_dtype}, {min_dtype}, {max_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " + f"{type(out)}" + ) + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. Expected output shape is {res_shape}, " + f"got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " + f"got {out.dtype}" + ) + + if ( + dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) + is None + ): + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + if ( + ti._array_overlap(min, out) + and not ti._same_logical_tensors(min, out) + and buf1_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(max, dpt.usm_ndarray): + if ( + ti._array_overlap(max, out) + and not ti._same_logical_tensors(max, out) + and buf2_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + a_min = min + else: + a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q) + if isinstance(max, dpt.usm_ndarray): + a_max = max + else: + a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q) + + if buf1_dt is None and buf2_dt is None: + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_min, + a_max, + ) + ) + else "C" + ) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, min=a_min, max=a_max, dst=out, sycl_queue=exec_q + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_binary_ev.wait() + return out + + elif buf1_dt is None: + if order == "K": + buf2 = 
_empty_like_orderK(a_max, buf2_dt) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_min, + ) + ) + else "C" + ) + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + buf2, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=a_min, + max=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_copy_ev.wait() + ht_binary_ev.wait() + return out + + elif buf2_dt is None: + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_max, + ) + ) + else "C" + ) + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + buf1, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + buf1 = dpt.broadcast_to(buf1, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=buf1, + max=a_max, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_copy_ev.wait() + ht_binary_ev.wait() + return out + + if order in ["K", "A"]: + if ( + x.flags.f_contiguous + and a_min.flags.f_contiguous + and a_max.flags.f_contiguous + ): + order = "F" + elif ( + x.flags.c_contiguous + and a_min.flags.c_contiguous + and a_max.flags.c_contiguous + ): + order = "C" + else: + order = "C" if order == "A" else "K" + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q + ) + if order == "K": + buf2 = _empty_like_orderK(a_max, buf2_dt) + else: + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + 
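For reference, the user-visible contract these branches implement: `clip` always returns an array with the data type of `x`, and weak Python-scalar bounds are cast to it when the `safe` casting rule allows. A small usage sketch (dtypes shown assume a typical device):

    import dpctl.tensor as dpt

    x = dpt.asarray([-2, 0, 5], dtype=dpt.int32)
    y = dpt.clip(x, 0, 3)  # Python int bounds are weak and cast safely
    print(y.dtype, dpt.asnumpy(y))  # int32 [0 0 3]

    z = dpt.asarray([0.1, 2.7], dtype=dpt.float32)
    w = dpt.clip(z, 0, 1)  # weak integral bounds cast to float32
    print(w.dtype)  # float32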
x = dpt.broadcast_to(x, res_shape)
+        buf1 = dpt.broadcast_to(buf1, res_shape)
+        buf2 = dpt.broadcast_to(buf2, res_shape)
+        ht_, _ = ti._clip(
+            src=x,
+            min=buf1,
+            max=buf2,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=[copy1_ev, copy2_ev],
+        )
+        dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_])
+        return out
diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py
index bc1b071460..81928692a6 100644
--- a/dpctl/tensor/_copy_utils.py
+++ b/dpctl/tensor/_copy_utils.py
@@ -37,7 +37,7 @@ def _copy_to_numpy(ary):
     if not isinstance(ary, dpt.usm_ndarray):
-        raise TypeError
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(ary)}")
     nb = ary.usm_data.nbytes
     hh = dpm.MemoryUSMHost(nb, queue=ary.sycl_queue)
     hh.copy_from_device(ary.usm_data)
diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index fca5b0734a..4a0d1c451f 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -39,6 +39,31 @@
 class UnaryElementwiseFunc:
     """
     Class that implements unary element-wise functions.
+
+    Args:
+        name (str):
+            Name of the unary function
+        result_type_resolver_fn (callable):
+            Function that takes dtype of the input and
+            returns the dtype of the result if the
+            implementation function supports it, or
+            returns `None` otherwise.
+        unary_dp_impl_fn (callable):
+            Data-parallel implementation function with signature
+            `impl_fn(src: usm_ndarray, dst: usm_ndarray,
+            sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
+            where the `src` is the argument array, `dst` is the
+            array to be populated with function values, effectively
+            evaluating `dst = func(src)`.
+            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
+            The first event corresponds to data-management host tasks,
+            including lifetime management of argument Python objects to ensure
+            that their associated USM allocation is not freed before offloaded
+            computational tasks complete execution, while the second event
+            corresponds to computational tasks associated with function
+            evaluation.
+        docs (str):
+            Documentation string for the unary function.
     """

     def __init__(self, name, result_type_resolver_fn, unary_dp_impl_fn, docs):
@@ -55,8 +80,31 @@ def __str__(self):
     def __repr__(self):
         return f"<{self.__name__} '{self.name_}'>"

+    def get_implementation_function(self):
+        """Returns the implementation function for
+        this elementwise unary function.
+
+        """
+        return self.unary_fn_
+
+    def get_type_result_resolver_function(self):
+        """Returns the type resolver function for this
+        elementwise unary function.
+        """
+        return self.result_type_resolver_fn_
+
     @property
     def types(self):
+        """Returns information about types supported by
+        implementation function, using NumPy's character
+        encoding for data types, e.g.
+
+        :Example:
+        .. code-block:: python
+
+            dpctl.tensor.sin.types
+            # Outputs: ['e->e', 'f->f', 'd->d', 'F->F', 'D->D']
+        """
         types = self.types_
         if not types:
             types = []
@@ -363,6 +411,56 @@ def _get_shape(o):
 class BinaryElementwiseFunc:
     """
     Class that implements binary element-wise functions.
+
+    Args:
+        name (str):
+            Name of the binary function
+        result_type_resolver_fn (callable):
+            Function that takes dtypes of the input and
+            returns the dtype of the result if the
+            implementation function supports it, or
+            returns `None` otherwise.
+        binary_dp_impl_fn (callable):
+            Data-parallel implementation function with signature
+            `impl_fn(src1: usm_ndarray, src2: usm_ndarray, dst: usm_ndarray,
+            sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
+            where the `src1` and `src2` are the argument arrays, `dst` is the
+            array to be populated with function values,
+            i.e. `dst=func(src1, src2)`.
+            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
+            The first event corresponds to data-management host tasks,
+            including lifetime management of argument Python objects to ensure
+            that their associated USM allocation is not freed before offloaded
+            computational tasks complete execution, while the second event
+            corresponds to computational tasks associated with function
+            evaluation.
+        docs (str):
+            Documentation string for the binary function.
+        binary_inplace_fn (callable, optional):
+            Data-parallel implementation function with signature
+            `impl_fn(src: usm_ndarray, dst: usm_ndarray,
+            sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
+            where the `src` is the argument array, `dst` is the
+            array to be populated with function values,
+            i.e. `dst=func(dst, src)`.
+            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
+            The first event corresponds to data-management host tasks,
+            including async lifetime management of Python arguments,
+            while the second event corresponds to computational tasks
+            associated with function evaluation.
+        acceptance_fn (callable, optional):
+            Function to influence type promotion behavior of this binary
+            function. The function takes 6 arguments:
+                arg1_dtype - Data type of the first argument
+                arg2_dtype - Data type of the second argument
+                ret_buf1_dtype - Data type the first argument would be cast to
+                ret_buf2_dtype - Data type the second argument would be cast to
+                res_dtype - Data type of the output array with function values
+                sycl_dev - The :class:`dpctl.SyclDevice` where the function
+                evaluation is carried out.
+            The function is only called when both arguments of the binary
+            function require casting, e.g. both arguments of
+            `dpctl.tensor.logaddexp` are arrays with integral data type.
     """

     def __init__(
@@ -392,8 +490,60 @@ def __str__(self):
     def __repr__(self):
         return f"<{self.__name__} '{self.name_}'>"

+    def get_implementation_function(self):
+        """Returns the out-of-place implementation
+        function for this elementwise binary function.
+
+        """
+        return self.binary_fn_
+
+    def get_implementation_inplace_function(self):
+        """Returns the in-place implementation
+        function for this elementwise binary function.
+
+        """
+        return self.binary_inplace_fn_
+
+    def get_type_result_resolver_function(self):
+        """Returns the type resolver function for this
+        elementwise binary function.
+        """
+        return self.result_type_resolver_fn_
+
+    def get_type_promotion_path_acceptance_function(self):
+        """Returns the acceptance function for this
+        elementwise binary function.
+
+        Acceptance function influences the type promotion
+        behavior of this binary function.
+        The function takes 6 arguments:
+            arg1_dtype - Data type of the first argument
+            arg2_dtype - Data type of the second argument
+            ret_buf1_dtype - Data type the first argument would be cast to
+            ret_buf2_dtype - Data type the second argument would be cast to
+            res_dtype - Data type of the output array with function values
+            sycl_dev - :class:`dpctl.SyclDevice` on which function evaluation
+            is carried out.
+ + The acceptance function is only invoked if both input arrays must be + cast to intermediary data types, as would happen during call of + `dpctl.tensor.hypot` with both arrays being of integral data type. + """ + return self.acceptance_fn_ + @property def types(self): + """Returns information about types supported by + implementation function, using NumPy's character + encoding for data types, e.g. + + :Example: + .. code-block:: python + + dpctl.tensor.divide.types + # Outputs: ['ee->e', 'ff->f', 'fF->F', 'dd->d', 'dD->D', + # 'Ff->F', 'FF->F', 'Dd->D', 'DD->D'] + """ types = self.types_ if not types: types = [] @@ -649,12 +799,7 @@ def __call__(self, o1, o2, out=None, order="K"): sycl_queue=exec_q, order=order, ) - else: - if res_dt != out.dtype: - raise TypeError( - f"Output array of type {res_dt} is needed," - f"got {out.dtype}" - ) + if src1.shape != res_shape: src1 = dpt.broadcast_to(src1, res_shape) buf2 = dpt.broadcast_to(buf2, res_shape) diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index 8e2abee837..9879960999 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_elementwise_impl as ti from ._elementwise_common import BinaryElementwiseFunc, UnaryElementwiseFunc from ._type_utils import _acceptance_fn_divide @@ -297,6 +297,7 @@ ti._bitwise_and_result_type, ti._bitwise_and, _bitwise_and_docstring_, + binary_inplace_fn=ti._bitwise_and_inplace, ) # B04: ===== BITWISE_LEFT_SHIFT (x1, x2) @@ -330,6 +331,7 @@ ti._bitwise_left_shift_result_type, ti._bitwise_left_shift, _bitwise_left_shift_docstring_, + binary_inplace_fn=ti._bitwise_left_shift_inplace, ) @@ -393,6 +395,7 @@ ti._bitwise_or_result_type, ti._bitwise_or, _bitwise_or_docstring_, + binary_inplace_fn=ti._bitwise_or_inplace, ) # B06: ===== BITWISE_RIGHT_SHIFT (x1, x2) @@ -425,6 +428,7 @@ ti._bitwise_right_shift_result_type, ti._bitwise_right_shift, _bitwise_right_shift_docstring_, + binary_inplace_fn=ti._bitwise_right_shift_inplace, ) @@ -459,6 +463,7 @@ ti._bitwise_xor_result_type, ti._bitwise_xor, _bitwise_xor_docstring_, + binary_inplace_fn=ti._bitwise_xor_inplace, ) @@ -590,6 +595,7 @@ ti._divide_result_type, ti._divide, _divide_docstring_, + binary_inplace_fn=ti._divide_inplace, acceptance_fn=_acceptance_fn_divide, ) @@ -720,6 +726,7 @@ ti._floor_divide_result_type, ti._floor_divide, _floor_divide_docstring_, + binary_inplace_fn=ti._floor_divide_inplace, ) # B11: ==== GREATER (x1, x2) @@ -1176,7 +1183,7 @@ _logical_xor_docstring_, ) -# B??: ==== MAXIMUM (x1, x2) +# B26: ==== MAXIMUM (x1, x2) _maximum_docstring_ = """ maximum(x1, x2, out=None, order='K') @@ -1206,7 +1213,7 @@ _maximum_docstring_, ) -# B??: ==== MINIMUM (x1, x2) +# B27: ==== MINIMUM (x1, x2) _minimum_docstring_ = """ minimum(x1, x2, out=None, order='K') @@ -1264,7 +1271,7 @@ ti._multiply_result_type, ti._multiply, _multiply_docstring_, - ti._multiply_inplace, + binary_inplace_fn=ti._multiply_inplace, ) # U25: ==== NEGATIVE (x) @@ -1359,10 +1366,14 @@ the returned array is determined by the Type Promotion Rules. 
""" pow = BinaryElementwiseFunc( - "pow", ti._pow_result_type, ti._pow, _pow_docstring_ + "pow", + ti._pow_result_type, + ti._pow, + _pow_docstring_, + binary_inplace_fn=ti._pow_inplace, ) -# U??: ==== PROJ (x) +# U40: ==== PROJ (x) _proj_docstring = """ proj(x, out=None, order='K') @@ -1441,7 +1452,11 @@ the returned array is determined by the Type Promotion Rules. """ remainder = BinaryElementwiseFunc( - "remainder", ti._remainder_result_type, ti._remainder, _remainder_docstring_ + "remainder", + ti._remainder_result_type, + ti._remainder, + _remainder_docstring_, + binary_inplace_fn=ti._remainder_inplace, ) # U28: ==== ROUND (x) @@ -1499,7 +1514,7 @@ "sign", ti._sign_result_type, ti._sign, _sign_docstring ) -# ==== SIGNBIT (x) +# U41: ==== SIGNBIT (x) _signbit_docstring = """ signbit(x, out=None, order='K') @@ -1652,7 +1667,7 @@ ti._subtract_result_type, ti._subtract, _subtract_docstring_, - ti._subtract_inplace, + binary_inplace_fn=ti._subtract_inplace, ) @@ -1759,3 +1774,116 @@ hypot = BinaryElementwiseFunc( "hypot", ti._hypot_result_type, ti._hypot, _hypot_docstring_ ) + + +# U37: ==== CBRT (x) +_cbrt_docstring_ = """ +cbrt(x, out=None, order='K') + +Computes positive cube-root for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a real floating-point data type. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is `None`. + Default: "K". +Returns: + usm_narray: + An array containing the element-wise positive cube-root. + The data type of the returned array is determined by + the Type Promotion Rules. +""" + +cbrt = UnaryElementwiseFunc( + "cbrt", ti._cbrt_result_type, ti._cbrt, _cbrt_docstring_ +) + + +# U38: ==== EXP2 (x) +_exp2_docstring_ = """ +exp2(x, out=None, order='K') + +Computes the base-2 exponential for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is `None`. + Default: "K". +Returns: + usm_narray: + An array containing the element-wise base-2 exponentials. + The data type of the returned array is determined by + the Type Promotion Rules. +""" + +exp2 = UnaryElementwiseFunc( + "exp2", ti._exp2_result_type, ti._exp2, _exp2_docstring_ +) + + +# B25: ==== COPYSIGN (x1, x2) +_copysign_docstring_ = """ +copysign(x1, x2, out=None, order='K') + +Composes a floating-point value with the magnitude of `x1_i` and the sign of +`x2_i` for each element of input arrays `x1` and `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a real floating-point data type. + x2 (usm_ndarray): + Second input array, also expected to have a real floating-point data + type. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is `None`. + Default: "K". +Returns: + usm_narray: + An array containing the element-wise results. The data type + of the returned array is determined by the Type Promotion Rules. 
+""" +copysign = BinaryElementwiseFunc( + "copysign", + ti._copysign_result_type, + ti._copysign, + _copysign_docstring_, +) + + +# U39: ==== RSQRT (x) +_rsqrt_docstring_ = """ +rsqrt(x, out=None, order='K') + +Computes the reciprocal square-root for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a real floating-point data type. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is `None`. + Default: "K". +Returns: + usm_narray: + An array containing the element-wise reciprocal square-root. + The data type of the returned array is determined by + the Type Promotion Rules. +""" + +rsqrt = UnaryElementwiseFunc( + "rsqrt", ti._rsqrt_result_type, ti._rsqrt, _rsqrt_docstring_ +) diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index 7201cd96fb..7135304b58 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -19,7 +19,6 @@ import operator import numpy as np -from numpy import AxisError from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple import dpctl @@ -929,20 +928,26 @@ def repeat(x, repeats, axis=None): Args: x (usm_ndarray): input array - repeat (Union[int, Tuple[int, ...]]): + repeats (Union[int, Sequence[int, ...], usm_ndarray]): The number of repetitions for each element. - `repeats` is broadcasted to fit the shape of the given axis. + `repeats` is broadcast to fit the shape of the given axis. + If `repeats` is an array, it must have an integer data type. + Otherwise, `repeats` must be a Python integer, tuple, list, or + range. axis (Optional[int]): - The axis along which to repeat values. The `axis` is required - if input array has more than one dimension. + The axis along which to repeat values. If `axis` is `None`, the + function repeats elements of the flattened array. + Default: `None`. Returns: usm_narray: Array with repeated elements. - The returned array must have the same data type as `x`, - is created on the same device as `x` and has the same USM - allocation type as `x`. + The returned array must have the same data type as `x`, is created + on the same device as `x` and has the same USM allocation type as + `x`. If `axis` is `None`, the returned array is one-dimensional, + otherwise, it has the same shape as `x`, except for the axis along + which elements were repeated. Raises: AxisError: if `axis` value is invalid. 
@@ -951,20 +956,11 @@ def repeat(x, repeats, axis=None): raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") x_ndim = x.ndim - if axis is None: - if x_ndim > 1: - raise ValueError( - f"`axis` cannot be `None` for array of dimension {x_ndim}" - ) - axis = 0 - x_shape = x.shape - if x_ndim > 0: + if axis is not None: axis = normalize_axis_index(operator.index(axis), x_ndim) axis_size = x_shape[axis] else: - if axis != 0: - AxisError("`axis` must be `0` for input of dimension `0`") axis_size = x.size scalar = False @@ -977,8 +973,8 @@ def repeat(x, repeats, axis=None): elif isinstance(repeats, dpt.usm_ndarray): if repeats.ndim > 1: raise ValueError( - "`repeats` array must be 0- or 1-dimensional, got" - "{repeats.ndim}" + "`repeats` array must be 0- or 1-dimensional, got " + f"{repeats.ndim}" ) exec_q = dpctl.utils.get_execution_queue( (x.sycl_queue, repeats.sycl_queue) @@ -1015,22 +1011,22 @@ def repeat(x, repeats, axis=None): if not dpt.all(repeats >= 0): raise ValueError("`repeats` elements must be positive") - elif isinstance(repeats, tuple): + elif isinstance(repeats, (tuple, list, range)): usm_type = x.usm_type exec_q = x.sycl_queue len_reps = len(repeats) - if len_reps != axis_size: - raise ValueError( - "`repeats` tuple must have the same length as the repeated " - "axis" - ) - elif len_reps == 1: + if len_reps == 1: repeats = repeats[0] if repeats < 0: raise ValueError("`repeats` elements must be positive") scalar = True else: + if len_reps != axis_size: + raise ValueError( + "`repeats` sequence must have the same length as the " + "repeated axis" + ) repeats = dpt.asarray( repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q ) @@ -1038,7 +1034,7 @@ def repeat(x, repeats, axis=None): raise ValueError("`repeats` elements must be positive") else: raise TypeError( - "Expected int, tuple, or `usm_ndarray` for second argument," + "Expected int, sequence, or `usm_ndarray` for second argument," f"got {type(repeats)}" ) @@ -1047,7 +1043,10 @@ def repeat(x, repeats, axis=None): if scalar: res_axis_size = repeats * axis_size - res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + if axis is not None: + res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + else: + res_shape = (res_axis_size,) res = dpt.empty( res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q ) @@ -1081,9 +1080,17 @@ def repeat(x, repeats, axis=None): res_axis_size = ti._cumsum_1d( rep_buf, cumsum, sycl_queue=exec_q, depends=[copy_ev] ) - res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + if axis is not None: + res_shape = ( + x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + ) + else: + res_shape = (res_axis_size,) res = dpt.empty( - res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q + res_shape, + dtype=x.dtype, + usm_type=usm_type, + sycl_queue=exec_q, ) if res_axis_size > 0: ht_rep_ev, _ = ti._repeat_by_sequence( @@ -1103,11 +1110,18 @@ def repeat(x, repeats, axis=None): usm_type=usm_type, sycl_queue=exec_q, ) - # _cumsum_1d synchronizes so `depends` ends here safely res_axis_size = ti._cumsum_1d(repeats, cumsum, sycl_queue=exec_q) - res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + if axis is not None: + res_shape = ( + x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + ) + else: + res_shape = (res_axis_size,) res = dpt.empty( - res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q + res_shape, + dtype=x.dtype, + usm_type=usm_type, + sycl_queue=exec_q, ) if res_axis_size > 0: ht_rep_ev, 
_ = ti._repeat_by_sequence( diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index d9bd6b5b2b..059ba61030 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -19,6 +19,7 @@ import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_reductions_impl as tri from ._type_utils import _to_device_supported_dtype @@ -52,18 +53,143 @@ def _default_reduction_dtype(inp_dt, q): return res_dt -def sum(arr, axis=None, dtype=None, keepdims=False): +def _default_reduction_dtype_fp_types(inp_dt, q): + """Gives default output data type for given input data + type `inp_dt` when reduction is performed on queue `q` + and the reduction supports only floating-point data types + """ + inp_kind = inp_dt.kind + if inp_kind in "biu": + res_dt = dpt.dtype(ti.default_device_fp_type(q)) + can_cast_v = dpt.can_cast(inp_dt, res_dt) + if not can_cast_v: + _fp64 = q.sycl_device.has_aspect_fp64 + res_dt = dpt.float64 if _fp64 else dpt.float32 + elif inp_kind in "f": + res_dt = dpt.dtype(ti.default_device_fp_type(q)) + if res_dt.itemsize < inp_dt.itemsize: + res_dt = inp_dt + elif inp_kind in "c": + raise TypeError("reduction not defined for complex types") + + return res_dt + + +def _reduction_over_axis( + x, + axis, + dtype, + keepdims, + _reduction_fn, + _dtype_supported, + _default_reduction_type_fn, +): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + red_nd = len(axis) + perm = [i for i in range(nd) if i not in axis] + list(axis) + arr2 = dpt.permute_dims(x, perm) + res_shape = arr2.shape[: nd - red_nd] + q = x.sycl_queue + inp_dt = x.dtype + if dtype is None: + res_dt = _default_reduction_type_fn(inp_dt, q) + else: + res_dt = dpt.dtype(dtype) + res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) + + res_usm_type = x.usm_type + if red_nd == 0: + return dpt.astype(x, res_dt, copy=True) + + host_tasks_list = [] + if _dtype_supported(inp_dt, res_dt, res_usm_type, q): + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e, _ = _reduction_fn( + src=arr2, trailing_dims_to_reduce=red_nd, dst=res, sycl_queue=q + ) + host_tasks_list.append(ht_e) + else: + if dtype is None: + raise RuntimeError( + "Automatically determined reduction data type does not " + "have direct implementation" + ) + if _dtype_supported(res_dt, res_dt, res_usm_type, q): + tmp = dpt.empty( + arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=tmp, sycl_queue=q + ) + host_tasks_list.append(ht_e_cpy) + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, _ = _reduction_fn( + src=tmp, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=q, + depends=[cpy_e], + ) + host_tasks_list.append(ht_e_red) + else: + buf_dt = _default_reduction_type_fn(inp_dt, q) + tmp = dpt.empty( + arr2.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=tmp, sycl_queue=q + ) + tmp_res = dpt.empty( + res_shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + host_tasks_list.append(ht_e_cpy) + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, r_e = 
_reduction_fn( + src=tmp, + trailing_dims_to_reduce=red_nd, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp_res, dst=res, sycl_queue=q, depends=[r_e] + ) + host_tasks_list.append(ht_e_cpy2) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + dpctl.SyclEvent.wait_for(host_tasks_list) + + return res + + +def sum(x, axis=None, dtype=None, keepdims=False): """sum(x, axis=None, dtype=None, keepdims=False) - Calculates the sum of the input array `x`. + Calculates the sum of elements in the input array `x`. Args: x (usm_ndarray): input array. - axis (Optional[int, Tuple[int,...]]): + axis (Optional[int, Tuple[int, ...]]): axis or axes along which sums must be computed. If a tuple of unique integers, sums are computed over multiple axes. - If `None`, the sum if computed over the entire array. + If `None`, the sum is computed over the entire array. Default: `None`. dtype (Optional[dtype]): data type of the returned array. If `None`, the default data @@ -72,13 +198,13 @@ def sum(arr, axis=None, dtype=None, keepdims=False): the returned array will have the default real-valued floating-point data type for the device where input array `x` is allocated. - * If x` has signed integral data type, the returned array + * If `x` has signed integral data type, the returned array will have the default signed integral type for the device where input array `x` is allocated. * If `x` has unsigned integral data type, the returned array will have the default unsigned integral type for the device where input array `x` is allocated. - * If `x` has a complex-valued floating-point data typee, + * If `x` has a complex-valued floating-point data type, the returned array will have the default complex-valued floating-pointer data type for the device where input array `x` is allocated. @@ -101,9 +227,192 @@ def sum(arr, axis=None, dtype=None, keepdims=False): array has the data type as described in the `dtype` parameter description above. """ - if not isinstance(arr, dpt.usm_ndarray): - raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(arr)}") - nd = arr.ndim + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + tri._sum_over_axis, + tri._sum_over_axis_dtype_supported, + _default_reduction_dtype, + ) + + +def prod(x, axis=None, dtype=None, keepdims=False): + """prod(x, axis=None, dtype=None, keepdims=False) + + Calculates the product of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which products must be computed. If a tuple + of unique integers, products are computed over multiple axes. + If `None`, the product is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + * If `x` has a real-valued floating-point data type, + the returned array will have the default real-valued + floating-point data type for the device where input + array `x` is allocated. + * If `x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. 
+ * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a complex-valued floating-point data type, + the returned array will have the default complex-valued + floating-pointer data type for the device where input + array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the product. Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the products. If the product was computed over + the entire array, a zero-dimensional array is returned. The returned + array has the data type as described in the `dtype` parameter + description above. + """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + tri._prod_over_axis, + tri._prod_over_axis_dtype_supported, + _default_reduction_dtype, + ) + + +def logsumexp(x, axis=None, dtype=None, keepdims=False): + """logsumexp(x, axis=None, dtype=None, keepdims=False) + + Calculates the logarithm of the sum of exponentials of elements in the + input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which values must be computed. If a tuple + of unique integers, values are computed over multiple axes. + If `None`, the result is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + * If `x` has a real-valued floating-point data type, + the returned array will have the default real-valued + floating-point data type for the device where input + array `x` is allocated. + * If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. + * If `x` has a complex-valued floating-point data type, + an error is raised. + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the result. Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the results. If the result was computed over + the entire array, a zero-dimensional array is returned. The returned + array has the data type as described in the `dtype` parameter + description above. 
+ """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + tri._logsumexp_over_axis, + lambda inp_dt, res_dt, *_: tri._logsumexp_over_axis_dtype_supported( + inp_dt, res_dt + ), + _default_reduction_dtype_fp_types, + ) + + +def reduce_hypot(x, axis=None, dtype=None, keepdims=False): + """reduce_hypot(x, axis=None, dtype=None, keepdims=False) + + Calculates the square root of the sum of squares of elements in the input + array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which values must be computed. If a tuple + of unique integers, values are computed over multiple axes. + If `None`, the result is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + * If `x` has a real-valued floating-point data type, + the returned array will have the default real-valued + floating-point data type for the device where input + array `x` is allocated. + * If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. + * If `x` has a complex-valued floating-point data type, + an error is raised. + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the result. Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the results. If the result was computed over + the entire array, a zero-dimensional array is returned. The returned + array has the data type as described in the `dtype` parameter + description above. 
+ """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + tri._hypot_over_axis, + lambda inp_dt, res_dt, *_: tri._hypot_over_axis_dtype_supported( + inp_dt, res_dt + ), + _default_reduction_dtype_fp_types, + ) + + +def _comparison_over_axis(x, axis, keepdims, _reduction_fn): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim if axis is None: axis = tuple(range(nd)) if not isinstance(axis, (tuple, list)): @@ -111,63 +420,221 @@ def sum(arr, axis=None, dtype=None, keepdims=False): axis = normalize_axis_tuple(axis, nd, "axis") red_nd = len(axis) perm = [i for i in range(nd) if i not in axis] + list(axis) - arr2 = dpt.permute_dims(arr, perm) - res_shape = arr2.shape[: nd - red_nd] - q = arr.sycl_queue - inp_dt = arr.dtype - if dtype is None: - res_dt = _default_reduction_dtype(inp_dt, q) - else: - res_dt = dpt.dtype(dtype) - res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) - - res_usm_type = arr.usm_type - if arr.size == 0: - if keepdims: - res_shape = res_shape + (1,) * red_nd - inv_perm = sorted(range(nd), key=lambda d: perm[d]) - res_shape = tuple(res_shape[i] for i in inv_perm) - return dpt.zeros( - res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q - ) + x_tmp = dpt.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] + exec_q = x.sycl_queue + res_dt = x.dtype + res_usm_type = x.usm_type + if x.size == 0: + if any([x.shape[i] == 0 for i in axis]): + raise ValueError( + "reduction cannot be performed over zero-size axes" + ) + else: + return dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) if red_nd == 0: - return dpt.astype(arr, res_dt, copy=False) + return dpt.copy(x) - host_tasks_list = [] - if ti._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): - res = dpt.empty( - res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q - ) - ht_e, _ = ti._sum_over_axis( - src=arr2, trailing_dims_to_reduce=red_nd, dst=res, sycl_queue=q - ) - host_tasks_list.append(ht_e) + res = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) + hev, _ = _reduction_fn( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=exec_q, + ) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + hev.wait() + return res + + +def max(x, axis=None, keepdims=False): + """max(x, axis=None, keepdims=False) + + Calculates the maximum value of the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which maxima must be computed. If a tuple + of unique integers, the maxima are computed over multiple axes. + If `None`, the max is computed over the entire array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the maxima. If the max was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as `x`. 
+ """ + return _comparison_over_axis(x, axis, keepdims, tri._max_over_axis) + + +def min(x, axis=None, keepdims=False): + """min(x, axis=None, keepdims=False) + + Calculates the minimum value of the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which minima must be computed. If a tuple + of unique integers, the minima are computed over multiple axes. + If `None`, the min is computed over the entire array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the minima. If the min was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as `x`. + """ + return _comparison_over_axis(x, axis, keepdims, tri._min_over_axis) + + +def _search_over_axis(x, axis, keepdims, _reduction_fn): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + elif isinstance(axis, int): + axis = (axis,) else: - if dtype is None: - raise RuntimeError( - "Automatically determined reduction data type does not " - "have direct implementation" - ) - tmp_dt = _default_reduction_dtype(inp_dt, q) - tmp = dpt.empty( - res_shape, dtype=tmp_dt, usm_type=res_usm_type, sycl_queue=q - ) - ht_e_tmp, r_e = ti._sum_over_axis( - src=arr2, trailing_dims_to_reduce=red_nd, dst=tmp, sycl_queue=q - ) - host_tasks_list.append(ht_e_tmp) - res = dpt.empty( - res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + raise TypeError( + f"`axis` argument expected `int` or `None`, got {type(axis)}" ) - ht_e, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=tmp, dst=res, sycl_queue=q, depends=[r_e] + axis = normalize_axis_tuple(axis, nd, "axis") + red_nd = len(axis) + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] + exec_q = x.sycl_queue + res_dt = ti.default_device_index_type(exec_q.sycl_device) + res_usm_type = x.usm_type + if x.size == 0: + if any([x.shape[i] == 0 for i in axis]): + raise ValueError( + "reduction cannot be performed over zero-size axes" + ) + else: + return dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) + if red_nd == 0: + return dpt.zeros( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q ) - host_tasks_list.append(ht_e) + + res = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) + hev, _ = _reduction_fn( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=exec_q, + ) if keepdims: res_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) - dpctl.SyclEvent.wait_for(host_tasks_list) - + hev.wait() return res + + +def argmax(x, axis=None, keepdims=False): + """argmax(x, axis=None, keepdims=False) + + Returns the indices of the maximum values of the input array `x` along a + specified axis. + + When the maximum value occurs multiple times, the indices corresponding to + the first occurrence are returned. 
+ + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If `None`, returns the index of the + maximum value of the flattened array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + maximum values. If the entire array was searched, a + zero-dimensional array is returned. The returned array has the + default array index data type for the device of `x`. + """ + return _search_over_axis(x, axis, keepdims, tri._argmax_over_axis) + + +def argmin(x, axis=None, keepdims=False): + """argmin(x, axis=None, keepdims=False) + + Returns the indices of the minimum values of the input array `x` along a + specified axis. + + When the minimum value occurs multiple times, the indices corresponding to + the first occurrence are returned. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If `None`, returns the index of the + minimum value of the flattened array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + minimum values. If the entire array was searched, a + zero-dimensional array is returned. The returned array has the + default array index data type for the device of `x`. + """ + return _search_over_axis(x, axis, keepdims, tri._argmin_over_axis) diff --git a/dpctl/tensor/_statistical_functions.py b/dpctl/tensor/_statistical_functions.py new file mode 100644 index 0000000000..54d748d2d2 --- /dev/null +++ b/dpctl/tensor/_statistical_functions.py @@ -0,0 +1,381 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
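The module introduced below composes existing primitives (sum reduction, in-place divide, subtract, square) into mean, var and std; a sketch of the basic identity it builds on, assuming a default SYCL device:

    import dpctl.tensor as dpt

    x = dpt.reshape(dpt.arange(6, dtype="f4"), (2, 3))
    dpt.mean(x, axis=1)     # [1.0, 4.0]
    dpt.sum(x, axis=1) / 3  # the same values: sum over the axis divided by its length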
+ +from numpy.core.numeric import normalize_axis_tuple + +import dpctl +import dpctl.tensor as dpt +import dpctl.tensor._tensor_elementwise_impl as tei +import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_reductions_impl as tri + + +def _var_impl(x, axis, correction, keepdims): + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [] + nelems = 1 + for i in range(nd): + if i not in axis: + perm.append(i) + else: + nelems *= x.shape[i] + red_nd = len(axis) + perm = perm + list(axis) + q = x.sycl_queue + inp_dt = x.dtype + res_dt = ( + inp_dt + if inp_dt.kind == "f" + else dpt.dtype(ti.default_device_fp_type(q)) + ) + res_usm_type = x.usm_type + + deps = [] + host_tasks_list = [] + if inp_dt != res_dt: + buf = dpt.empty_like(x, dtype=res_dt) + ht_e_buf, c_e1 = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=buf, sycl_queue=q + ) + deps.append(c_e1) + host_tasks_list.append(ht_e_buf) + else: + buf = x + # calculate mean + buf2 = dpt.permute_dims(buf, perm) + res_shape = buf2.shape[: nd - red_nd] + # use keepdims=True path for later broadcasting + if red_nd == 0: + mean_ary = dpt.empty_like(buf) + ht_e1, c_e2 = ti._copy_usm_ndarray_into_usm_ndarray( + src=buf, dst=mean_ary, sycl_queue=q + ) + deps.append(c_e2) + host_tasks_list.append(ht_e1) + else: + mean_ary = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=q, + ) + ht_e1, r_e1 = tri._sum_over_axis( + src=buf2, + trailing_dims_to_reduce=red_nd, + dst=mean_ary, + sycl_queue=q, + depends=deps, + ) + host_tasks_list.append(ht_e1) + deps.append(r_e1) + + mean_ary_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + mean_ary = dpt.permute_dims( + dpt.reshape(mean_ary, mean_ary_shape), inv_perm + ) + # divide in-place to get mean + mean_ary_shape = mean_ary.shape + nelems_ary = dpt.asarray( + nelems, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + if nelems_ary.shape != mean_ary_shape: + nelems_ary = dpt.broadcast_to(nelems_ary, mean_ary_shape) + ht_e2, d_e1 = tei._divide_inplace( + lhs=mean_ary, rhs=nelems_ary, sycl_queue=q, depends=deps + ) + host_tasks_list.append(ht_e2) + # subtract mean from original array to get deviations + dev_ary = dpt.empty_like(buf) + if mean_ary_shape != buf.shape: + mean_ary = dpt.broadcast_to(mean_ary, buf.shape) + ht_e4, su_e = tei._subtract( + src1=buf, src2=mean_ary, dst=dev_ary, sycl_queue=q, depends=[d_e1] + ) + host_tasks_list.append(ht_e4) + # square deviations + ht_e5, sq_e = tei._square( + src=dev_ary, dst=dev_ary, sycl_queue=q, depends=[su_e] + ) + host_tasks_list.append(ht_e5) + deps2 = [] + # take sum of squared deviations + dev_ary2 = dpt.permute_dims(dev_ary, perm) + if red_nd == 0: + res = dev_ary + deps2.append(sq_e) + else: + res = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=q, + ) + ht_e6, r_e2 = tri._sum_over_axis( + src=dev_ary2, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=q, + depends=[sq_e], + ) + host_tasks_list.append(ht_e6) + deps2.append(r_e2) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + res_shape = res.shape + # when nelems - correction <= 0, yield nans + div = max(nelems - correction, 0) + if not div: + div = dpt.nan + div_ary = dpt.asarray(div, res_dt, usm_type=res_usm_type, sycl_queue=q) + # divide 
in-place again + if div_ary.shape != res_shape: + div_ary = dpt.broadcast_to(div_ary, res.shape) + ht_e7, d_e2 = tei._divide_inplace( + lhs=res, rhs=div_ary, sycl_queue=q, depends=deps2 + ) + host_tasks_list.append(ht_e7) + return res, [d_e2], host_tasks_list + + +def mean(x, axis=None, keepdims=False): + """mean(x, axis=None, keepdims=False) + + Calculates the arithmetic mean of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which the arithmetic means must be computed. If + a tuple of unique integers, the means are computed over multiple + axes. If `None`, the mean is computed over the entire array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input array according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the arithmetic means. If the mean was computed + over the entire array, a zero-dimensional array is returned. + + If `x` has a floating-point data type, the returned array will have + the same data type as `x`. + If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [] + nelems = 1 + for i in range(nd): + if i not in axis: + perm.append(i) + else: + nelems *= x.shape[i] + sum_nd = len(axis) + perm = perm + list(axis) + arr2 = dpt.permute_dims(x, perm) + res_shape = arr2.shape[: nd - sum_nd] + q = x.sycl_queue + inp_dt = x.dtype + res_dt = ( + x.dtype + if x.dtype.kind in "fc" + else dpt.dtype(ti.default_device_fp_type(q)) + ) + res_usm_type = x.usm_type + if sum_nd == 0: + return dpt.astype(x, res_dt, copy=True) + + s_e = [] + host_tasks_list = [] + if tri._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e1, r_e = tri._sum_over_axis( + src=arr2, trailing_dims_to_reduce=sum_nd, dst=res, sycl_queue=q + ) + host_tasks_list.append(ht_e1) + s_e.append(r_e) + else: + tmp = dpt.empty( + arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=tmp, sycl_queue=q + ) + host_tasks_list.append(ht_e_cpy) + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, r_e = tri._sum_over_axis( + src=tmp, + trailing_dims_to_reduce=sum_nd, + dst=res, + sycl_queue=q, + depends=[cpy_e], + ) + host_tasks_list.append(ht_e_red) + s_e.append(r_e) + + if keepdims: + res_shape = res_shape + (1,) * sum_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + + res_shape = res.shape + # in-place divide + den_dt = dpt.finfo(res_dt).dtype if res_dt.kind == "c" else res_dt + nelems_arr = dpt.asarray( + nelems, dtype=den_dt, usm_type=res_usm_type, sycl_queue=q + ) + if nelems_arr.shape != res_shape: + nelems_arr = dpt.broadcast_to(nelems_arr, res_shape) + 
ht_e2, _ = tei._divide_inplace(
+        lhs=res, rhs=nelems_arr, sycl_queue=q, depends=s_e
+    )
+    host_tasks_list.append(ht_e2)
+    dpctl.SyclEvent.wait_for(host_tasks_list)
+    return res
+
+
+def var(x, axis=None, correction=0.0, keepdims=False):
+    """var(x, axis=None, correction=0.0, keepdims=False)
+
+    Calculates the variance of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which the variances must be computed. If a tuple
+            of unique integers, the variances are computed over multiple axes.
+            If `None`, the variance is computed over the entire array.
+            Default: `None`.
+        correction (Optional[float, int]):
+            degrees of freedom adjustment. The divisor used in calculating the
+            variance is `N-correction`, where `N` corresponds to the total
+            number of elements over which the variance is calculated.
+            Default: `0.0`.
+        keepdims (Optional[bool]):
+            if `True`, the reduced axes (dimensions) are included in the result
+            as singleton dimensions, so that the returned array remains
+            compatible with the input array according to Array Broadcasting
+            rules. Otherwise, if `False`, the reduced axes are not included in
+            the returned array. Default: `False`.
+    Returns:
+        usm_ndarray:
+            an array containing the variances. If the variance was computed
+            over the entire array, a zero-dimensional array is returned.
+
+            If `x` has a real-valued floating-point data type, the returned
+            array will have the same data type as `x`.
+            If `x` has a boolean or integral data type, the returned array
+            will have the default floating point data type for the device
+            where input array `x` is allocated.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+    if not isinstance(correction, (int, float)):
+        raise TypeError(
+            "Expected a Python integer or float for `correction`, "
+            f"got {type(correction)}"
+        )
+
+    if x.dtype.kind == "c":
+        raise ValueError("`var` does not support complex types")
+
+    res, _, host_tasks_list = _var_impl(x, axis, correction, keepdims)
+    dpctl.SyclEvent.wait_for(host_tasks_list)
+    return res
+
+
+def std(x, axis=None, correction=0.0, keepdims=False):
+    """std(x, axis=None, correction=0.0, keepdims=False)
+
+    Calculates the standard deviation of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which the standard deviations must be computed.
+            If a tuple of unique integers, the standard deviations are computed
+            over multiple axes. If `None`, the standard deviation is computed
+            over the entire array. Default: `None`.
+        correction (Optional[float, int]):
+            degrees of freedom adjustment. The divisor used in calculating the
+            standard deviation is `N-correction`, where `N` corresponds to the
+            total number of elements over which the standard deviation is
+            calculated. Default: `0.0`.
+        keepdims (Optional[bool]):
+            if `True`, the reduced axes (dimensions) are included in the result
+            as singleton dimensions, so that the returned array remains
+            compatible with the input array according to Array Broadcasting
+            rules. Otherwise, if `False`, the reduced axes are not included in
+            the returned array. Default: `False`.
+    Returns:
+        usm_ndarray:
+            an array containing the standard deviations. If the standard
+            deviation was computed over the entire array, a zero-dimensional
+            array is returned.
+
+            If `x` has a real-valued floating-point data type, the returned
+            array will have the same data type as `x`.
+            If `x` has a boolean or integral data type, the returned array
+            will have the default floating point data type for the device
+            where input array `x` is allocated.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+    if not isinstance(correction, (int, float)):
+        raise TypeError(
+            "Expected a Python integer or float for `correction`, "
+            f"got {type(correction)}"
+        )
+
+    if x.dtype.kind == "c":
+        raise ValueError("`std` does not support complex types")
+
+    res, deps, host_tasks_list = _var_impl(x, axis, correction, keepdims)
+    ht_ev, _ = tei._sqrt(
+        src=res, dst=res, sycl_queue=res.sycl_queue, depends=deps
+    )
+    host_tasks_list.append(ht_ev)
+    dpctl.SyclEvent.wait_for(host_tasks_list)
+    return res
diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index ba18600135..5b394d971b 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -182,13 +182,20 @@ cdef class usm_ndarray:
         cdef bint is_fp16 = False
 
         self._reset()
-        if (not isinstance(shape, (list, tuple))
-                and not hasattr(shape, 'tolist')):
-            try:
-                shape
-                shape = [shape, ]
-            except Exception:
-                raise TypeError("Argument shape must be a list or a tuple.")
+        if not isinstance(shape, (list, tuple)):
+            if hasattr(shape, 'tolist'):
+                fn = getattr(shape, 'tolist')
+                if callable(fn):
+                    shape = shape.tolist()
+            if not isinstance(shape, (list, tuple)):
+                try:
+                    shape
+                    shape = [shape, ]
+                except Exception as e:
+                    raise TypeError(
+                        "Argument shape must be a non-negative integer, "
+                        "or a list/tuple of such integers."
+                    ) from e
         nd = len(shape)
         if dtype is None:
             if isinstance(buffer, (dpmem._memory._Memory, usm_ndarray)):
diff --git a/dpctl/tensor/_utility_functions.py b/dpctl/tensor/_utility_functions.py
index 500c997e8f..69a1a200df 100644
--- a/dpctl/tensor/_utility_functions.py
+++ b/dpctl/tensor/_utility_functions.py
@@ -3,6 +3,7 @@
 import dpctl
 import dpctl.tensor as dpt
 import dpctl.tensor._tensor_impl as ti
+import dpctl.tensor._tensor_reductions_impl as tri
 
 
 def _boolean_reduction(x, axis, keepdims, func):
@@ -94,7 +95,7 @@ def all(x, axis=None, keepdims=False):
             An array with a data type of `bool` containing the results of the
             logical AND reduction.
     """
-    return _boolean_reduction(x, axis, keepdims, ti._all)
+    return _boolean_reduction(x, axis, keepdims, tri._all)
 
 
 def any(x, axis=None, keepdims=False):
@@ -122,4 +123,4 @@ def any(x, axis=None, keepdims=False):
             An array with a data type of `bool` containing the results of the
             logical OR reduction.
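The boolean reductions keep their user-facing behavior after being rerouted to the `tri` module; a quick sketch (assuming a default SYCL device):

    import dpctl.tensor as dpt

    x = dpt.asarray([[0, 1], [2, 3]])
    dpt.any(x)          # 0-dimensional bool array: True, one element is nonzero
    dpt.all(x, axis=1)  # [False, True]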
""" - return _boolean_reduction(x, axis, keepdims, ti._any) + return _boolean_reduction(x, axis, keepdims, tri._any) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 110010706c..a8ef1c423e 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -125,54 +125,56 @@ sycl::event inclusive_scan_rec(sycl::queue &exec_q, auto lws = sycl::range<1>(wg_size); auto gws = sycl::range<1>(n_groups * wg_size); + auto ndRange = sycl::nd_range<1>(gws, lws); + slmT slm_iscan_tmp(lws, cgh); - cgh.parallel_for>( - sycl::nd_range<1>(gws, lws), [=, slm_iscan_tmp = std::move(slm_iscan_tmp)](sycl::nd_item<1> it) - { - auto chunk_gid = it.get_global_id(0); - auto lid = it.get_local_id(0); + using KernelName = inclusive_scan_rec_local_scan_krn< + inputT, outputT, n_wi, IndexerT, decltype(transformer)>; + + cgh.parallel_for(ndRange, [=, slm_iscan_tmp = std::move( + slm_iscan_tmp)]( + sycl::nd_item<1> it) { + auto chunk_gid = it.get_global_id(0); + auto lid = it.get_local_id(0); - std::array local_isum; + std::array local_isum; - size_t i = chunk_gid * n_wi; - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - constexpr outputT out_zero(0); + size_t i = chunk_gid * n_wi; + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + constexpr outputT out_zero(0); - local_isum[m_wi] = - (i + m_wi < n_elems) - ? transformer(input[indexer(s0 + s1 * (i + m_wi))]) - : out_zero; - } + local_isum[m_wi] = + (i + m_wi < n_elems) + ? transformer(input[indexer(s0 + s1 * (i + m_wi))]) + : out_zero; + } -// local_isum is now result of -// inclusive scan of locally stored mask indicators #pragma unroll - for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += local_isum[m_wi - 1]; - } + for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += local_isum[m_wi - 1]; + } + // local_isum is now result of + // inclusive scan of locally stored inputs - size_t wg_iscan_val = - sycl::inclusive_scan_over_group(it.get_group(), - local_isum.back(), - sycl::plus(), - size_t(0)); + size_t wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_isum.back(), sycl::plus(), + size_t(0)); - slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; - it.barrier(sycl::access::fence_space::local_space); - size_t addand = (lid == 0) ? 0 : slm_iscan_tmp[lid]; - it.barrier(sycl::access::fence_space::local_space); + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + size_t addand = (lid == 0) ? 
0 : slm_iscan_tmp[lid]; #pragma unroll - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += addand; - } - - for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) { - output[i + m_wi] = local_isum[m_wi]; - } - }); + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += addand; + } + + for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) + { + output[i + m_wi] = local_isum[m_wi]; + } + }); }); sycl::event out_event = inc_scan_phase1_ev; diff --git a/dpctl/tensor/libtensor/include/kernels/clip.hpp b/dpctl/tensor/libtensor/include/kernels/clip.hpp new file mode 100644 index 0000000000..9cca9f615b --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/clip.hpp @@ -0,0 +1,311 @@ +//=== clip.hpp - Implementation of clip kernels ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for dpctl.tensor.clip. +//===----------------------------------------------------------------------===// + +#pragma once +#include "pybind11/numpy.h" +#include "pybind11/stl.h" +#include +#include +#include +#include +#include +#include + +#include "utils/math_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace clip +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using namespace dpctl::tensor::offset_utils; + +template T clip(const T &x, const T &min, const T &max) +{ + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::max_complex; + using dpctl::tensor::math_utils::min_complex; + return min_complex(max_complex(x, min), max); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + auto tmp = (std::isnan(x) || x > min) ? x : min; + return (std::isnan(tmp) || tmp < max) ? tmp : max; + } + else if constexpr (std::is_same_v) { + return (x || min) && max; + } + else { + auto tmp = (x > min) ? x : min; + return (tmp < max) ? 
tmp : max; + } +} + +template class ClipContigFunctor +{ +private: + size_t nelems = 0; + const T *x_p = nullptr; + const T *min_p = nullptr; + const T *max_p = nullptr; + T *dst_p = nullptr; + +public: + ClipContigFunctor(size_t nelems_, + const T *x_p_, + const T *min_p_, + const T *max_p_, + T *dst_p_) + : nelems(nelems_), x_p(x_p_), min_p(min_p_), max_p(max_p_), + dst_p(dst_p_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0]; + size_t base = ndit.get_global_linear_id(); + + base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize); + for (size_t offset = base; + offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz)); + offset += sgSize) + { + dst_p[offset] = clip(x_p[offset], min_p[offset], max_p[offset]); + } + } + else { + auto sg = ndit.get_sub_group(); + std::uint8_t sgSize = sg.get_local_range()[0]; + std::uint8_t max_sgSize = sg.get_max_local_range()[0]; + size_t base = n_vecs * vec_sz * + (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * max_sgSize); + + if (base + n_vecs * vec_sz * sgSize < nelems && + sgSize == max_sgSize) { + sycl::vec x_vec; + sycl::vec min_vec; + sycl::vec max_vec; + sycl::vec dst_vec; +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + auto idx = base + it * sgSize; + auto x_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x_p[idx]); + auto min_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&min_p[idx]); + auto max_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&max_p[idx]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[idx]); + + x_vec = sg.load(x_multi_ptr); + min_vec = sg.load(min_multi_ptr); + max_vec = sg.load(max_multi_ptr); +#pragma unroll + for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { + dst_vec[vec_id] = clip(x_vec[vec_id], min_vec[vec_id], + max_vec[vec_id]); + } + sg.store(dst_multi_ptr, dst_vec); + } + } + else { + for (size_t k = base + sg.get_local_id()[0]; k < nelems; + k += sgSize) { + dst_p[k] = clip(x_p[k], min_p[k], max_p[k]); + } + } + } + } +}; + +template class clip_contig_kernel; + +typedef sycl::event (*clip_contig_impl_fn_ptr_t)( + sycl::queue &, + size_t, + const char *, + const char *, + const char *, + char *, + const std::vector &); + +template +sycl::event clip_contig_impl(sycl::queue &q, + size_t nelems, + const char *x_cp, + const char *min_cp, + const char *max_cp, + char *dst_cp, + const std::vector &depends) +{ + const T *x_tp = reinterpret_cast(x_cp); + const T *min_tp = reinterpret_cast(min_cp); + const T *max_tp = reinterpret_cast(max_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event clip_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + size_t lws = 64; + constexpr unsigned int vec_sz = 4; + constexpr unsigned int n_vecs = 2; + const size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + cgh.parallel_for>( + sycl::nd_range<1>(gws_range, lws_range), + ClipContigFunctor(nelems, x_tp, min_tp, max_tp, + dst_tp)); + }); + + return 
clip_ev; +} + +template class ClipStridedFunctor +{ +private: + const T *x_p = nullptr; + const T *min_p = nullptr; + const T *max_p = nullptr; + T *dst_p = nullptr; + IndexerT indexer; + +public: + ClipStridedFunctor(const T *x_p_, + const T *min_p_, + const T *max_p_, + T *dst_p_, + IndexerT indexer_) + : x_p(x_p_), min_p(min_p_), max_p(max_p_), dst_p(dst_p_), + indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + size_t gid = id[0]; + auto offsets = indexer(static_cast(gid)); + dst_p[offsets.get_fourth_offset()] = clip( + x_p[offsets.get_first_offset()], min_p[offsets.get_second_offset()], + max_p[offsets.get_third_offset()]); + } +}; + +template class clip_strided_kernel; + +typedef sycl::event (*clip_strided_impl_fn_ptr_t)( + sycl::queue &, + size_t, + int, + const char *, + const char *, + const char *, + char *, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event clip_strided_impl(sycl::queue &q, + size_t nelems, + int nd, + const char *x_cp, + const char *min_cp, + const char *max_cp, + char *dst_cp, + const py::ssize_t *shape_strides, + py::ssize_t x_offset, + py::ssize_t min_offset, + py::ssize_t max_offset, + py::ssize_t dst_offset, + const std::vector &depends) +{ + const T *x_tp = reinterpret_cast(x_cp); + const T *min_tp = reinterpret_cast(min_cp); + const T *max_tp = reinterpret_cast(max_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event clip_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + FourOffsets_StridedIndexer indexer{ + nd, x_offset, min_offset, max_offset, dst_offset, shape_strides}; + + cgh.parallel_for>( + sycl::range<1>(nelems), + ClipStridedFunctor( + x_tp, min_tp, max_tp, dst_tp, indexer)); + }); + + return clip_ev; +} + +template struct ClipStridedFactory +{ + fnT get() + { + fnT fn = clip_strided_impl; + return fn; + } +}; + +template struct ClipContigFactory +{ + fnT get() + { + + fnT fn = clip_contig_impl; + return fn; + } +}; + +} // namespace clip +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp index 016b3a05d3..d88d17d3e3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp @@ -33,6 +33,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -257,6 +258,144 @@ struct BitwiseAndStridedFactory } }; +template struct BitwiseAndInplaceFunctor +{ + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + using tu_ns::convert_impl; + + if constexpr (std::is_same_v) { + res = res && in; + } + else { + res &= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (res && in); + res = vec_cast( + tmp); + } + else { + res &= in; + } + } +}; + +template +using BitwiseAndInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseAndInplaceFunctor, + vec_sz, + n_vecs>; + +template +using BitwiseAndInplaceStridedFunctor = + 
elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseAndInplaceFunctor>; + +template +class bitwise_and_inplace_contig_kernel; + +template +sycl::event +bitwise_and_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseAndInplaceContigFunctor, + bitwise_and_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct BitwiseAndInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseAndOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_and_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_and_inplace_strided_kernel; + +template +sycl::event bitwise_and_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseAndInplaceStridedFunctor, + bitwise_and_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseAndInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseAndOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_and_inplace_strided_impl; + return fn; + } + } +}; + } // namespace bitwise_and } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp index 9ce56be966..ed4aeeb59e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp @@ -35,6 +35,8 @@ #include "utils/type_utils.hpp" #include +#include "kernels/elementwise_functions/common.hpp" + namespace dpctl { namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp index 4ae04f97de..5cfd6ca5e3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp @@ -34,6 +34,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -268,6 +269,150 @@ struct BitwiseLeftShiftStridedFactory } }; +template struct BitwiseLeftShiftInplaceFunctor +{ + static_assert(std::is_integral_v); + static_assert(!std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + impl(res, in); + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + impl(res[i], in[i]); + } + } + +private: + void impl(resT &res, const argT &in) 
const + { + constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); + constexpr resT zero = resT(0); + + // bitshift op with second operand negative, or >= bitwidth(argT1) is UB + // array API spec mandates 0 + if constexpr (std::is_unsigned_v) { + (in < res_bitsize) ? (res <<= in) : res = zero; + } + else { + (in < argT(0)) ? res = zero + : ((in < res_bitsize) ? (res <<= in) : res = zero); + } + } +}; + +template +using BitwiseLeftShiftInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseLeftShiftInplaceFunctor, + vec_sz, + n_vecs>; + +template +using BitwiseLeftShiftInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseLeftShiftInplaceFunctor>; + +template +class bitwise_left_shift_inplace_contig_kernel; + +template +sycl::event bitwise_left_shift_inplace_contig_impl( + sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseLeftShiftInplaceContigFunctor, + bitwise_left_shift_inplace_contig_kernel>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseLeftShiftInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_left_shift_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_left_shift_inplace_strided_kernel; + +template +sycl::event bitwise_left_shift_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseLeftShiftInplaceStridedFunctor, + bitwise_left_shift_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseLeftShiftInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_left_shift_inplace_strided_impl; + return fn; + } + } +}; + } // namespace bitwise_left_shift } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp index 65f25dd296..d5669d41b1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp @@ -33,6 +33,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -253,6 +254,144 @@ template struct BitwiseOrStridedFactory } }; +template struct BitwiseOrInplaceFunctor +{ + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + using tu_ns::convert_impl; + + if constexpr (std::is_same_v) { + res = res || in; + } + else { + res |= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { + + 
if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (res || in); + res = vec_cast( + tmp); + } + else { + res |= in; + } + } +}; + +template +using BitwiseOrInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseOrInplaceFunctor, + vec_sz, + n_vecs>; + +template +using BitwiseOrInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseOrInplaceFunctor>; + +template +class bitwise_or_inplace_contig_kernel; + +template +sycl::event +bitwise_or_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseOrInplaceContigFunctor, + bitwise_or_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct BitwiseOrInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseOrOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_or_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_or_inplace_strided_kernel; + +template +sycl::event bitwise_or_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseOrInplaceStridedFunctor, + bitwise_or_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseOrInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseOrOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_or_inplace_strided_impl; + return fn; + } + } +}; + } // namespace bitwise_or } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp index 9442d4f6b7..5a04165701 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp @@ -34,6 +34,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -270,6 +271,152 @@ struct BitwiseRightShiftStridedFactory } }; +template struct BitwiseRightShiftInplaceFunctor +{ + static_assert(std::is_integral_v); + static_assert(!std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + impl(res, in); + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + impl(res[i], in[i]); + } + } + +private: + void impl(resT &res, const argT &in) const + { + constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); + constexpr resT zero = resT(0); + + // bitshift op with second operand negative, or 
>= bitwidth(argT1) is UB + // array API spec mandates 0 + if constexpr (std::is_unsigned_v) { + (in < res_bitsize) ? (res >>= in) : res = zero; + } + else { + (in < argT(0)) ? res = zero + : ((in < res_bitsize) ? (res >>= in) + : (res < resT(0)) ? res = resT(-1) + : res = zero); + } + } +}; + +template +using BitwiseRightShiftInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseRightShiftInplaceFunctor, + vec_sz, + n_vecs>; + +template +using BitwiseRightShiftInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseRightShiftInplaceFunctor>; + +template +class bitwise_right_shift_inplace_contig_kernel; + +template +sycl::event bitwise_right_shift_inplace_contig_impl( + sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseRightShiftInplaceContigFunctor, + bitwise_right_shift_inplace_contig_kernel>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseRightShiftInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_right_shift_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_right_shift_inplace_strided_kernel; + +template +sycl::event bitwise_right_shift_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseRightShiftInplaceStridedFunctor, + bitwise_right_shift_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseRightShiftInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_right_shift_inplace_strided_impl; + return fn; + } + } +}; + } // namespace bitwise_right_shift } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp index 2b0ab09dca..ec8192fd0f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp @@ -33,6 +33,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -257,6 +258,144 @@ struct BitwiseXorStridedFactory } }; +template struct BitwiseXorInplaceFunctor +{ + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + using tu_ns::convert_impl; + + if constexpr (std::is_same_v) { + res = (res != in); + } + else { + res ^= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (res != 
in); + res = vec_cast( + tmp); + } + else { + res ^= in; + } + } +}; + +template +using BitwiseXorInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseXorInplaceFunctor, + vec_sz, + n_vecs>; + +template +using BitwiseXorInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseXorInplaceFunctor>; + +template +class bitwise_xor_inplace_contig_kernel; + +template +sycl::event +bitwise_xor_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseXorInplaceContigFunctor, + bitwise_xor_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct BitwiseXorInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseXorOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_xor_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_xor_inplace_strided_kernel; + +template +sycl::event bitwise_xor_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseXorInplaceStridedFunctor, + bitwise_xor_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseXorInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseXorOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_xor_inplace_strided_impl; + return fn; + } + } +}; + } // namespace bitwise_xor } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp new file mode 100644 index 0000000000..1d4aa65002 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp @@ -0,0 +1,172 @@ +//=== cbrt.hpp - Unary function CBRT ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of CBRT(x) +/// function that compute a square root. 
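A sketch of the intended elementwise semantics, namely the real (sign-preserving) cube root; `dpt.cbrt` is assumed here to be the Python binding these kernels back:

    import dpctl.tensor as dpt

    x = dpt.asarray([8.0, 27.0, -1.0], dtype="f4")
    dpt.cbrt(x)  # [2.0, 3.0, -1.0]; sycl::cbrt is defined for negative inputs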
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace cbrt +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +template struct CbrtFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::true_type; + + resT operator()(const argT &in) const + { + return sycl::cbrt(in); + } +}; + +template +using CbrtContigFunctor = elementwise_common:: + UnaryContigFunctor, vec_sz, n_vecs>; + +template +using CbrtStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template struct CbrtOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class cbrt_contig_kernel; + +template +sycl::event cbrt_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + return elementwise_common::unary_contig_impl< + argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); +} + +template struct CbrtContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cbrt_contig_impl; + return fn; + } + } +}; + +template struct CbrtTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::cbrt(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename CbrtOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template class cbrt_strided_kernel; + +template +sycl::event +cbrt_strided_impl(sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, CbrtOutputType, CbrtStridedFunctor, cbrt_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template struct CbrtStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cbrt_strided_impl; + return fn; + } + } +}; + +} // namespace cbrt +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp new file mode 100644 index 0000000000..b1997d06b4 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp @@ -0,0 +1,215 @@ +//=== copysign.hpp - Binary function COPYSIGN ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of COPYSIGN(x1, x2) +/// function. 
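A sketch of the expected semantics, assuming `dpt.copysign` is the binding built on these kernels: the magnitude of the first argument is combined with the sign of the second, signed zeros included:

    import dpctl.tensor as dpt

    x = dpt.asarray([1.5, -2.0, 3.0], dtype="f4")
    s = dpt.asarray([-1.0, 1.0, -0.0], dtype="f4")
    dpt.copysign(x, s)  # [-1.5, 2.0, -3.0]; the sign of -0.0 propagates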
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copysign +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template struct CopysignFunctor +{ + + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + return sycl::copysign(in1, in2); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = sycl::copysign(in1, in2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using CopysignContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs>; + +template +using CopysignStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + CopysignFunctor>; + +template struct CopysignOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class copysign_contig_kernel; + +template +sycl::event copysign_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_impl< + argTy1, argTy2, CopysignOutputType, CopysignContigFunctor, + copysign_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template struct CopysignContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename CopysignOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = copysign_contig_impl; + return fn; + } + } +}; + +template struct CopysignTypeMapFactory +{ + /*! 
@brief get typeid for output type of copysign(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename CopysignOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+class copysign_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+copysign_strided_impl(sycl::queue &exec_q,
+                      size_t nelems,
+                      int nd,
+                      const py::ssize_t *shape_and_strides,
+                      const char *arg1_p,
+                      py::ssize_t arg1_offset,
+                      const char *arg2_p,
+                      py::ssize_t arg2_offset,
+                      char *res_p,
+                      py::ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, CopysignOutputType, CopysignStridedFunctor,
+        copysign_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                 arg1_offset, arg2_p, arg2_offset, res_p,
+                                 res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2> struct CopysignStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename CopysignOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = copysign_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace copysign
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
new file mode 100644
index 0000000000..67ee23df48
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
@@ -0,0 +1,229 @@
+//=== exp2.hpp - Unary function EXP2                             ------
+//*-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of EXP2(x) function.
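A sketch of the expected elementwise behavior, assuming `dpt.exp2` is the Python binding these kernels implement:

    import dpctl.tensor as dpt

    x = dpt.asarray([0.0, 1.0, 10.0], dtype="f4")
    dpt.exp2(x)  # [1.0, 2.0, 1024.0]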
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace exp2 +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template struct Exp2Functor +{ + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + const argT tmp = in * std::log(realT(2)); + + constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + + const realT x = std::real(tmp); + const realT y = std::imag(tmp); + if (std::isfinite(x)) { + if (std::isfinite(y)) { + return std::exp(tmp); + } + else { + return resT{q_nan, q_nan}; + } + } + else if (std::isnan(x)) { + /* x is nan */ + if (y == realT(0)) { + return resT{in}; + } + else { + return resT{x, q_nan}; + } + } + else { + if (!std::signbit(x)) { /* x is +inf */ + if (y == realT(0)) { + return resT{x, y}; + } + else if (std::isfinite(y)) { + return resT{x * std::cos(y), x * std::sin(y)}; + } + else { + /* x = +inf, y = +-inf || nan */ + return resT{x, q_nan}; + } + } + else { /* x is -inf */ + if (std::isfinite(y)) { + realT exp_x = std::exp(x); + return resT{exp_x * std::cos(y), exp_x * std::sin(y)}; + } + else { + /* x = -inf, y = +-inf || nan */ + return resT{0, 0}; + } + } + } + } + else { + return sycl::exp2(in); + } + } +}; + +template +using Exp2ContigFunctor = elementwise_common:: + UnaryContigFunctor, vec_sz, n_vecs>; + +template +using Exp2StridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template struct Exp2OutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class exp2_contig_kernel; + +template +sycl::event exp2_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + return elementwise_common::unary_contig_impl< + argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); +} + +template struct Exp2ContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = exp2_contig_impl; + return fn; + } + } +}; + +template struct Exp2TypeMapFactory +{ + /*! 
@brief get typeid for output type of std::exp2(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename Exp2OutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template class exp2_strided_kernel; + +template +sycl::event +exp2_strided_impl(sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, Exp2OutputType, Exp2StridedFunctor, exp2_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template struct Exp2StridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = exp2_strided_impl; + return fn; + } + } +}; + +} // namespace exp2 +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp index ad75924070..241c0e7ca8 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -34,6 +34,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -57,12 +58,7 @@ struct FloorDivideFunctor resT operator()(const argT1 &in1, const argT2 &in2) const { - if constexpr (std::is_same_v && - std::is_same_v) { - return (in2) ? static_cast(in1) : resT(0); - } - else if constexpr (std::is_integral_v || - std::is_integral_v) { + if constexpr (std::is_integral_v || std::is_integral_v) { if (in2 == argT2(0)) { return resT(0); } @@ -87,16 +83,7 @@ struct FloorDivideFunctor operator()(const sycl::vec &in1, const sycl::vec &in2) const { - if constexpr (std::is_same_v && - std::is_same_v) { - sycl::vec res; -#pragma unroll - for (int i = 0; i < vec_sz; ++i) { - res[i] = (in2[i]) ? 
static_cast(in1[i]) : resT(0); - } - return res; - } - else if constexpr (std::is_integral_v) { + if constexpr (std::is_integral_v) { sycl::vec res; #pragma unroll for (int i = 0; i < vec_sz; ++i) { @@ -165,7 +152,6 @@ template struct FloorDivideOutputType { using value_type = typename std::disjunction< // disjunction is C++17 // feature, supported by DPC++ - td_ns::BinaryTypeMapResultEntry, td_ns::BinaryTypeMapResultEntry struct FloorDivideInplaceFunctor +{ + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + void operator()(resT &in1, const argT &in2) const + { + if constexpr (std::is_integral_v) { + if (in2 == argT(0)) { + in1 = 0; + return; + } + if constexpr (std::is_signed_v) { + auto tmp = in1; + in1 /= in2; + auto mod = tmp % in2; + auto corr = (mod != 0 && l_xor(mod < 0, in2 < 0)); + in1 -= corr; + } + else { + in1 /= in2; + } + } + else { + in1 /= in2; + if (in1 == resT(0)) { + return; + } + in1 = std::floor(in1); + } + } + + template + void operator()(sycl::vec &in1, + const sycl::vec &in2) const + { + if constexpr (std::is_integral_v) { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + if (in2[i] == argT(0)) { + in1[i] = 0; + } + else { + if constexpr (std::is_signed_v) { + auto tmp = in1[i]; + in1[i] /= in2[i]; + auto mod = tmp % in2[i]; + auto corr = (mod != 0 && l_xor(mod < 0, in2[i] < 0)); + in1[i] -= corr; + } + else { + in1[i] /= in2[i]; + } + } + } + } + else { + in1 /= in2; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + if (in2[i] != argT(0)) { + in1[i] = std::floor(in1[i]); + } + } + } + } + +private: + bool l_xor(bool b1, bool b2) const + { + return (b1 != b2); + } +}; + +template +using FloorDivideInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + FloorDivideInplaceFunctor, + vec_sz, + n_vecs>; + +template +using FloorDivideInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + FloorDivideInplaceFunctor>; + +template +class floor_divide_inplace_contig_kernel; + +template +sycl::event +floor_divide_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, FloorDivideInplaceContigFunctor, + floor_divide_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct FloorDivideInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename FloorDivideOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_divide_inplace_contig_impl; + return fn; + } + } +}; + +template +class floor_divide_inplace_strided_kernel; + +template +sycl::event floor_divide_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, FloorDivideInplaceStridedFunctor, + floor_divide_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct FloorDivideInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename FloorDivideOutputType::value_type, + 
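// The in-place functor above reproduces Python-style floor division for
// signed integers: truncating division plus a correction whenever the
// remainder is non-zero and has the opposite sign of the divisor. A
// standalone sketch of that rule (hypothetical helper name):
template <typename T>
T floor_div_sketch(T a, T b)
{
    if (b == T(0)) {
        return T(0); // division by zero yields 0 in these kernels
    }
    T q = a / b; // C++ integer division truncates toward zero
    T r = a % b;
    if (r != T(0) && ((r < T(0)) != (b < T(0)))) {
        q -= T(1); // step down to round toward negative infinity
    }
    return q;
}
// e.g. floor_div_sketch(-7, 2) == -4, whereas plain -7 / 2 == -3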
void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_divide_inplace_strided_impl; + return fn; + } + } +}; + } // namespace floor_divide } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp index 90b7997a37..6a187da6f4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -31,6 +31,7 @@ #include #include +#include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/type_dispatch.hpp" #include "utils/type_utils.hpp" @@ -61,7 +62,8 @@ template struct LogAddExpFunctor resT operator()(const argT1 &in1, const argT2 &in2) const { - return impl(in1, in2); + using dpctl::tensor::math_utils::logaddexp; + return logaddexp(in1, in2); } template @@ -79,7 +81,8 @@ template struct LogAddExpFunctor impl_finite(-std::abs(diff[i])); } else { - res[i] = impl(in1[i], in2[i]); + using dpctl::tensor::math_utils::logaddexp; + res[i] = logaddexp(in1[i], in2[i]); } } @@ -87,26 +90,6 @@ template struct LogAddExpFunctor } private: - template T impl(T const &in1, T const &in2) const - { - if (in1 == in2) { // handle signed infinities - const T log2 = std::log(T(2)); - return in1 + log2; - } - else { - const T tmp = in1 - in2; - if (tmp > 0) { - return in1 + std::log1p(std::exp(-tmp)); - } - else if (tmp <= 0) { - return in2 + std::log1p(std::exp(tmp)); - } - else { - return std::numeric_limits::quiet_NaN(); - } - } - } - template T impl_finite(T const &in) const { return (in > 0) ? (in + std::log1p(std::exp(-in))) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp index ba9241b8db..6654bae384 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp @@ -35,6 +35,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -55,31 +56,30 @@ template struct PowFunctor using supports_sg_loadstore = std::negation< std::disjunction, tu_ns::is_complex>>; - using supports_vec = - std::negation, - tu_ns::is_complex, - std::is_integral, - std::is_integral>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; - resT operator()(argT1 in1, argT2 in2) const + resT operator()(const argT1 &in1, const argT2 &in2) const { if constexpr (std::is_integral_v || std::is_integral_v) { + auto tmp1 = in1; + auto tmp2 = in2; if constexpr (std::is_signed_v) { - if (in2 < 0) { + if (tmp2 < 0) { // invalid; return 0 return resT(0); } } resT res = 1; - if (in1 == 1 || in2 == 0) { + if (tmp1 == 1 || tmp2 == 0) { return res; } - while (in2 > 0) { - if (in2 & 1) { - res *= in1; + while (tmp2 > 0) { + if (tmp2 & 1) { + res *= tmp1; } - in2 >>= 1; - in1 *= in1; + tmp2 >>= 1; + tmp1 *= tmp1; } return res; } @@ -93,16 +93,48 @@ template struct PowFunctor operator()(const sycl::vec &in1, const sycl::vec &in2) const { - auto res = sycl::pow(in1, in2); - if constexpr (std::is_same_v) { + if constexpr (std::is_integral_v || std::is_integral_v) { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + auto tmp1 = in1[i]; + auto tmp2 = in2[i]; + if constexpr (std::is_signed_v) { + if (tmp2 < 
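// The scalar branch of LogAddExpFunctor now delegates to a shared
// math_utils::logaddexp helper. The inline impl() removed above computed
// log(exp(a) + exp(b)) overflow-safely by factoring out the larger
// argument; a sketch of that identity, which the shared helper is assumed
// to preserve (hypothetical name):
#include <algorithm>
#include <cmath>

template <typename T>
T logaddexp_sketch(T a, T b)
{
    if (a == b) {
        return a + std::log(T(2)); // also covers equal signed infinities
    }
    const T mx = std::max(a, b);
    const T mn = std::min(a, b);
    return mx + std::log1p(std::exp(mn - mx)); // NaN inputs propagate
}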
0) { + // invalid; yield 0 + res[i] = 0; + continue; + } + } + resT res_tmp = 1; + if (tmp1 == 1 || tmp2 == 0) { + res[i] = res_tmp; + continue; + } + while (tmp2 > 0) { + if (tmp2 & 1) { + res_tmp *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + res[i] = res_tmp; + } return res; } else { - using dpctl::tensor::type_utils::vec_cast; + auto res = sycl::pow(in1, in2); + if constexpr (std::is_same_v) + { + return res; + } + else { + using dpctl::tensor::type_utils::vec_cast; - return vec_cast( - res); + return vec_cast(res); + } } } }; @@ -128,10 +160,6 @@ using PowStridedFunctor = IndexerT, PowFunctor>; -// TODO: when type promotion logic is better defined, -// consider implementing overloads of std::pow that take -// integers for the exponents. Seem to give better accuracy in -// some cases (complex data especially) template struct PowOutputType { using value_type = typename std::disjunction< // disjunction is C++17 @@ -286,6 +314,184 @@ template struct PowStridedFactory } }; +template struct PowInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { + auto tmp1 = res; + auto tmp2 = in; + if constexpr (std::is_signed_v) { + if (tmp2 < 0) { + // invalid; return 0 + res = 0; + return; + } + } + if (tmp1 == 1) { + return; + } + if (tmp2 == 0) { + res = 1; + return; + } + resT res_tmp = 1; + while (tmp2 > 0) { + if (tmp2 & 1) { + res_tmp *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + res = res_tmp; + return; + } + else { + res = std::pow(res, in); + }; + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + auto tmp1 = res[i]; + auto tmp2 = in[i]; + if constexpr (std::is_signed_v) { + if (tmp2 < 0) { + // invalid; return 0 + res[i] = 0; + continue; + } + } + if (tmp1 == 1) { + continue; + } + if (tmp2 == 0) { + res[i] = 1; + continue; + } + resT res_tmp = 1; + while (tmp2 > 0) { + if (tmp2 & 1) { + res_tmp *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + res[i] = res_tmp; + } + } + else { + res = sycl::pow(res, in); + } + } +}; + +template +using PowInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + PowInplaceFunctor, + vec_sz, + n_vecs>; + +template +using PowInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + PowInplaceFunctor>; + +template +class pow_inplace_contig_kernel; + +template +sycl::event +pow_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, PowInplaceContigFunctor, pow_inplace_contig_kernel>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template struct PowInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = pow_inplace_contig_impl; + return fn; + } + } +}; + +template +class pow_inplace_strided_kernel; + +template +sycl::event +pow_inplace_strided_impl(sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char 
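// The integral branches of PowFunctor (and of PowInplaceFunctor below) use
// binary exponentiation: consume the exponent bit by bit, squaring the base
// at each step, so the loop runs O(log exponent) iterations. Negative
// integer exponents are treated as invalid and yield 0, matching the
// guarded branches above. A standalone sketch (hypothetical name):
template <typename T>
T ipow_sketch(T base, T exponent)
{
    if (exponent < T(0)) {
        return T(0); // invalid for integral types in these kernels
    }
    T result = T(1);
    while (exponent > T(0)) {
        if (exponent & T(1)) {
            result *= base; // fold in the base where the bit is set
        }
        exponent >>= 1;
        base *= base;
    }
    return result;
}
// e.g. ipow_sketch(3, 5) == 243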
*res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, PowInplaceStridedFunctor, pow_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct PowInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = pow_inplace_strided_impl; + return fn; + } + } +}; + } // namespace pow } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp index 6cd306a900..051a1f9029 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp @@ -35,6 +35,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -313,6 +314,194 @@ template struct RemainderStridedFactory } }; +template struct RemainderInplaceFunctor +{ + + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + // functor is only well-defined when argT and resT are the same + static_assert(std::is_same_v); + + void operator()(resT &res, const argT &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { + if (in == argT(0)) { + res = 0; + return; + } + if constexpr (std::is_signed_v || std::is_signed_v) { + auto tmp = res; + res %= in; + if (res != resT(0) && l_xor(tmp < 0, in < 0)) { + res += in; + } + } + else { + res %= in; + } + } + else { + res = sycl::fmod(res, in); + if (res) { + if (l_xor(in < 0, res < 0)) { + res += in; + } + } + else { + res = sycl::copysign(resT(0), in); + } + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { +#pragma unroll + for (auto i = 0; i < vec_sz; ++i) { + if (in[i] == argT(0)) { + res[i] = 0; + } + else { + auto rem = res[i] % in[i]; + if constexpr (std::is_signed_v || + std::is_signed_v) { + if (rem != 0 && l_xor(res[i] < 0, in[i] < 0)) { + rem += in[i]; + } + } + res[i] = rem; + } + } + } + else { + res = sycl::fmod(res, in); +#pragma unroll + for (auto i = 0; i < vec_sz; ++i) { + if (res[i]) { + if (l_xor(in[i] < 0, res[i] < 0)) { + res[i] += in[i]; + } + } + else { + res[i] = sycl::copysign(resT(0), in[i]); + } + } + } + } + +private: + bool l_xor(bool b1, bool b2) const + { + return (b1 != b2); + } +}; + +template +using RemainderInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + RemainderInplaceFunctor, + vec_sz, + n_vecs>; + +template +using RemainderInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + RemainderInplaceFunctor>; + +template +class remainder_inplace_contig_kernel; + +template +sycl::event +remainder_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, RemainderInplaceContigFunctor, + remainder_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, 
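// RemainderInplaceFunctor above follows Python/NumPy remainder semantics:
// the result takes the sign of the divisor. Integers use % plus a sign
// correction; floats use fmod plus the same correction, with copysign
// giving a zero result the divisor's sign. A floating-point sketch
// (hypothetical name; std:: calls stand in for the sycl:: builtins):
#include <cmath>

template <typename T>
T pymod_sketch(T a, T b)
{
    T r = std::fmod(a, b); // carries the sign of the dividend a
    if (r != T(0)) {
        if ((b < T(0)) != (r < T(0))) {
            r += b; // move into the divisor's sign range
        }
    }
    else {
        r = std::copysign(T(0), b); // signed zero follows the divisor
    }
    return r;
}
// e.g. pymod_sketch(-7.0, 3.0) == 2.0, while std::fmod(-7.0, 3.0) == -1.0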
depends); +} + +template +struct RemainderInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename RemainderOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = remainder_inplace_contig_impl; + return fn; + } + } +}; + +template +class remainder_inplace_strided_kernel; + +template +sycl::event remainder_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, RemainderInplaceStridedFunctor, + remainder_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides, + arg_p, arg_offset, res_p, res_offset, + depends, additional_depends); +} + +template +struct RemainderInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename RemainderOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = remainder_inplace_strided_impl; + return fn; + } + } +}; + } // namespace remainder } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp new file mode 100644 index 0000000000..de51b31c30 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp @@ -0,0 +1,179 @@ +//=== rsqrt.hpp - Unary function RSQRT ------ +//*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of RSQRT(x) +/// function that computes the reciprocal square root. 
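// rsqrt(x) = 1 / sqrt(x). The functor below defers to sycl::rsqrt, which
// devices commonly lower to a dedicated instruction; an equivalent host
// reference for validating results is simply (hypothetical name):
#include <cmath>

template <typename T>
T rsqrt_reference(T x)
{
    return T(1) / std::sqrt(x); // +inf at x == +0, NaN for x < 0
}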
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace rsqrt +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +template struct RsqrtFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::true_type; + + resT operator()(const argT &in) const + { + return sycl::rsqrt(in); + } +}; + +template +using RsqrtContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs>; + +template +using RsqrtStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template struct RsqrtOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class rsqrt_contig_kernel; + +template +sycl::event rsqrt_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + return elementwise_common::unary_contig_impl< + argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); +} + +template struct RsqrtContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = rsqrt_contig_impl; + return fn; + } + } +}; + +template struct RsqrtTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::rsqrt(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename RsqrtOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template class rsqrt_strided_kernel; + +template +sycl::event +rsqrt_strided_impl(sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, RsqrtOutputType, RsqrtStridedFunctor, rsqrt_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template struct RsqrtStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = rsqrt_strided_impl; + return fn; + } + } +}; + +} // namespace rsqrt +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp index 3eb8420933..e4ae857738 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp @@ -34,6 +34,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp index 9f488e6598..86fb0ca2e2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -34,6 +34,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -370,6 +371,233 @@ struct TrueDivideContigRowContigMatrixBroadcastFactory } }; +template struct TrueDivideInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) + { + res /= in; + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + res /= in; + } +}; + +// cannot use the out of place table, as it permits real lhs and complex rhs +// T1 corresponds to the type of the rhs, while T2 corresponds to the lhs +// the type of the result must be the same as T2 +template struct TrueDivideInplaceOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + std::complex>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +struct TrueDivideInplaceTypeMapFactory +{ + /*! 
@brief get typeid for output type of divide(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename TrueDivideInplaceOutputType::value_type; + static_assert(std::is_same_v || std::is_same_v); + return td_ns::GetTypeid{}.get(); + } +}; + +template +using TrueDivideInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + TrueDivideInplaceFunctor, + vec_sz, + n_vecs>; + +template +using TrueDivideInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + TrueDivideInplaceFunctor>; + +template +class true_divide_inplace_contig_kernel; + +template +sycl::event +true_divide_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, TrueDivideInplaceContigFunctor, + true_divide_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct TrueDivideInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_inplace_contig_impl; + return fn; + } + } +}; + +template +class true_divide_inplace_strided_kernel; + +template +sycl::event true_divide_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, TrueDivideInplaceStridedFunctor, + true_divide_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct TrueDivideInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_inplace_strided_impl; + return fn; + } + } +}; + +template +class true_divide_inplace_row_matrix_broadcast_sg_krn; + +template +using TrueDivideInplaceRowMatrixBroadcastingFunctor = + elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor< + argT, + resT, + TrueDivideInplaceFunctor>; + +template +sycl::event true_divide_inplace_row_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_row_matrix_broadcast_impl< + argT, resT, TrueDivideInplaceRowMatrixBroadcastingFunctor, + true_divide_inplace_row_matrix_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset, + depends); +} + +template +struct TrueDivideInplaceRowMatrixBroadcastFactory +{ + fnT get() + { + using resT = typename TrueDivideInplaceOutputType::value_type; + if constexpr (!std::is_same_v) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_inplace_row_matrix_broadcast_impl; + return fn; + } + } + } +}; + } // 
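// Because in-place division writes through the left operand, the
// TrueDivideInplaceOutputType table above only admits pairs whose promoted
// result type equals the lhs type T2, unlike the out-of-place table (which
// would permit a real lhs with a complex rhs). A compile-time sketch of
// that constraint (hypothetical helper):
#include <complex>
#include <type_traits>
#include <utility>

template <typename rhsT, typename lhsT>
constexpr bool inplace_divide_ok()
{
    // the common type of lhs / rhs must round-trip into the lhs
    using resT = decltype(std::declval<lhsT>() / std::declval<rhsT>());
    return std::is_same_v<resT, lhsT>;
}

static_assert(inplace_divide_ok<float, std::complex<float>>());
static_assert(!inplace_divide_ok<std::complex<float>, float>());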
namespace true_divide } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 7dfc956492..adbf96be10 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -24,6 +24,7 @@ #pragma once #include +#include #include #include #include @@ -32,6 +33,7 @@ #include #include "pybind11/pybind11.h" +#include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/sycl_utils.hpp" #include "utils/type_dispatch.hpp" @@ -39,6 +41,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; namespace dpctl { @@ -47,6 +50,20 @@ namespace tensor namespace kernels { +template struct needs_workaround +{ + static constexpr bool value = + std::is_same_v> && + (std::is_same_v || std::is_same_v); +}; + +template struct can_use_reduce_over_group +{ + static constexpr bool value = + sycl::has_known_identity::value && + !needs_workaround::value; +}; + template (inp_[inp_offset]); + red_val = reduction_op_(red_val, val); } out_[out_iter_offset] = red_val; @@ -153,7 +172,7 @@ struct ReductionOverGroupWithAtomicFunctor const size_t reduction_lid = it.get_local_id(0); const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg - // work-items sums over input with indices + // work-items operate over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg // + reduction_lid // for 0 <= m < reductions_per_wi @@ -191,11 +210,17 @@ struct ReductionOverGroupWithAtomicFunctor sycl::memory_scope::device, sycl::access::address_space::global_space> res_ref(out_[out_iter_offset]); - if constexpr (std::is_same_v> || - std::is_same_v>) - { + if constexpr (su_ns::IsPlus::value) { res_ref += red_val_over_wg; } + else if constexpr (std::is_same_v>) + { + res_ref.fetch_max(red_val_over_wg); + } + else if constexpr (std::is_same_v>) + { + res_ref.fetch_min(red_val_over_wg); + } else { outT read_val = res_ref.load(); outT new_val{}; @@ -207,7 +232,103 @@ struct ReductionOverGroupWithAtomicFunctor } }; -typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( +/* === Reduction, using custom_reduce_over_group, and sycl::atomic_ref === */ + +template +struct CustomReductionOverGroupWithAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupWithAtomicFunctor( + const argT *data, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + + 
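// Each work-group below folds up to reductions_per_wi * wg elements of the
// reduction axis into one value in shared local memory via
// custom_reduce_over_group, and the group leader then merges that value
// into the global result with the compare-exchange loop at the end; this
// functor serves operators for which sycl::reduce_over_group has no known
// identity.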
const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) + { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); + + local_red_val = reduction_op_(local_red_val, val); + } + + auto work_group = it.get_group(); + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_iter_offset]); + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); + } + } +}; + +typedef sycl::event (*reduction_strided_impl_fn_ptr)( sycl::queue &, size_t, size_t, @@ -223,27 +344,51 @@ typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( const std::vector &); template -class sum_reduction_over_group_with_atomics_krn; +class reduction_over_group_with_atomics_krn; -template -class sum_reduction_over_group_with_atomics_init_krn; +template +class custom_reduction_over_group_with_atomics_krn; + +template +class reduction_over_group_with_atomics_init_krn; template -class sum_reduction_seq_strided_krn; +class reduction_seq_strided_krn; template -class sum_reduction_seq_contig_krn; +class reduction_seq_contig_krn; template -class sum_reduction_axis0_over_group_with_atomics_contig_krn; +class reduction_axis0_over_group_with_atomics_contig_krn; + +template +class custom_reduction_axis0_over_group_with_atomics_contig_krn; template -class sum_reduction_axis1_over_group_with_atomics_contig_krn; +class reduction_axis1_over_group_with_atomics_contig_krn; + +template +class custom_reduction_axis1_over_group_with_atomics_contig_krn; using dpctl::tensor::sycl_utils::choose_workgroup_size; -template -sycl::event sum_reduction_over_group_with_atomics_strided_impl( +template +sycl::event reduction_over_group_with_atomics_strided_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. 
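// When the reduction operator has no dedicated atomic (only +=, fetch_max
// and fetch_min are special-cased above), each group leader publishes its
// partial result with a compare-exchange loop. A standalone sketch of that
// device-side update pattern, assuming a SYCL 2020 compiler (hypothetical
// name):
#include <sycl/sycl.hpp>

template <typename T, typename Op>
void atomic_accumulate_sketch(T *dest, T group_val, Op op)
{
    sycl::atomic_ref<T, sycl::memory_order::relaxed,
                     sycl::memory_scope::device,
                     sycl::access::address_space::global_space>
        ref(*dest);
    T seen = ref.load();
    T desired;
    do {
        desired = op(seen, group_val); // recompute against the latest value
    } while (!ref.compare_exchange_strong(seen, desired));
}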
of rows in a matrix // when reducing over rows) @@ -263,8 +408,7 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( const argTy *arg_tp = reinterpret_cast(arg_cp); resTy *res_tp = reinterpret_cast(res_cp); - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -285,7 +429,7 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - cgh.parallel_for>( sycl::range<1>(iter_nelems), @@ -308,8 +452,8 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, res_strides); using InitKernelName = - class sum_reduction_over_group_with_atomics_init_krn; + class reduction_over_group_with_atomics_init_krn; cgh.depends_on(depends); cgh.parallel_for( @@ -333,11 +477,11 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - constexpr size_t preferrered_reductions_per_wi = 4; + constexpr size_t preferred_reductions_per_wi = 8; size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) + (reduction_nelems < preferred_reductions_per_wi * wg) ? std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; + : preferred_reductions_per_wi; size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / @@ -347,18 +491,38 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_with_atomics_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class reduction_over_group_with_atomics_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_with_atomics_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); return comp_ev; @@ -367,7 +531,7 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( // Contig -typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( +typedef sycl::event (*reduction_contig_impl_fn_ptr)( sycl::queue 
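// Work partitioning used throughout these drivers: each work-item folds up
// to reductions_per_wi inputs, a work-group therefore covers
// reductions_per_wi * wg elements of the reduction axis, and the group
// count is the ceiling division seen above. As a sketch:
#include <cstddef>

inline std::size_t reduction_groups_sketch(std::size_t reduction_nelems,
                                           std::size_t reductions_per_wi,
                                           std::size_t wg)
{
    // ceil(reduction_nelems / (reductions_per_wi * wg))
    return (reduction_nelems + reductions_per_wi * wg - 1) /
           (reductions_per_wi * wg);
}
// e.g. 10000 elements with 8 per work-item and wg == 256 -> 5 groups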
&, size_t, size_t, @@ -379,8 +543,8 @@ typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( const std::vector &); /* @brief Reduce rows in a matrix */ -template -sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( +template +sycl::event reduction_axis1_over_group_with_atomics_contig_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. of rows in a matrix // when reducing over rows) @@ -397,8 +561,7 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -422,7 +585,7 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; - cgh.parallel_for>( sycl::range<1>(iter_nelems), @@ -456,11 +619,11 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( result_indexer}; ReductionIndexerT reduction_indexer{}; - constexpr size_t preferrered_reductions_per_wi = 8; + constexpr size_t preferred_reductions_per_wi = 8; size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) + (reduction_nelems < preferred_reductions_per_wi * wg) ? std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; + : preferred_reductions_per_wi; size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / @@ -470,28 +633,47 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = - class sum_reduction_axis1_over_group_with_atomics_contig_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = + class reduction_axis1_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class + custom_reduction_axis1_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); - return comp_ev; } } /* @brief Reduce rows in a matrix */ -template -sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( +template +sycl::event 
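// The axis-0 variant below collapses a C-contiguous matrix along its rows,
// producing one value per column, so consecutive elements of one reduction
// sit iter_nelems apart in memory; that is what the Strided1DIndexer with
// step iter_nelems encodes. A serial reference (hypothetical name):
#include <cstddef>

template <typename T, typename Op>
void reduce_axis0_sketch(const T *mat, T *out, std::size_t n_rows,
                         std::size_t n_cols, T identity, Op op)
{
    for (std::size_t j = 0; j < n_cols; ++j) {
        T acc = identity;
        for (std::size_t i = 0; i < n_rows; ++i) {
            acc = op(acc, mat[i * n_cols + j]); // stride of n_cols per read
        }
        out[j] = acc;
    }
}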
reduction_axis0_over_group_with_atomics_contig_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. of cols in a matrix // when reducing over cols) @@ -508,14 +690,47 @@ sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - { + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + using KernelName = + class reduction_seq_contig_krn; + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + else { sycl::event res_init_ev = exec_q.fill( res_tp, resTy(identity_val), iter_nelems, depends); @@ -537,11 +752,11 @@ sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( 0, /* size */ static_cast(reduction_nelems), /* step */ static_cast(iter_nelems)}; - constexpr size_t preferrered_reductions_per_wi = 8; + constexpr size_t preferred_reductions_per_wi = 8; size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) + (reduction_nelems < preferred_reductions_per_wi * wg) ? 
std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; + : preferred_reductions_per_wi; size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / @@ -551,21 +766,40 @@ sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = - class sum_reduction_axis0_over_group_with_atomics_contig_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = + class reduction_axis0_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class + custom_reduction_axis0_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); - return comp_ev; } } @@ -618,7 +852,7 @@ struct ReductionOverGroupNoAtomicFunctor const size_t reduction_batch_id = it.get_group(0) / iter_gws_; const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - // work-items sums over input with indices + // work-items operates over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg // + reduction_lid // for 0 <= m < reductions_per_wi @@ -658,95 +892,4368 @@ struct ReductionOverGroupNoAtomicFunctor } }; -template -class sum_reduction_over_group_temps_krn; +/* = Reduction, using custom_reduce_over_group and not using atomic_ref*/ -template -sycl::event sum_reduction_over_group_temps_strided_impl( - sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
- // number of columns) - const char *arg_cp, - char *res_cp, - int iter_nd, - const py::ssize_t *iter_shape_and_strides, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - int red_nd, - const py::ssize_t *reduction_shape_stride, - py::ssize_t reduction_arg_offset, - const std::vector &depends) +template +struct CustomReductionOverGroupNoAtomicFunctor { - const argTy *arg_tp = reinterpret_cast(arg_cp); - resTy *res_tp = reinterpret_cast(res_cp); - - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; - const sycl::device &d = exec_q.get_device(); - const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); +public: + CustomReductionOverGroupNoAtomicFunctor( + const argT *data, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } - constexpr size_t preferrered_reductions_per_wi = 4; - size_t max_wg = d.get_info(); + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requires 1 work-group, can output directly to res - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi - InputOutputIterIndexerT in_out_iter_indexer{ - iter_nd, iter_arg_offset, iter_res_offset, - iter_shape_and_strides}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); - wg = max_wg; - reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); + outT local_red_val(identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; - size_t reduction_groups = - (reduction_nelems + reductions_per_wi 
* wg - 1) / - (reductions_per_wi * wg); - assert(reduction_groups == 1); + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); - }); + local_red_val = reduction_op_(local_red_val, val); + } + } + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +typedef sycl::event (*reduction_strided_impl_fn_ptr)( + sycl::queue &, + size_t, + size_t, + const char *, + char *, + int, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + int, + const py::ssize_t *, + py::ssize_t, + const std::vector &); + +template +class reduction_over_group_temps_strided_krn; + +template +class custom_reduction_over_group_temps_strided_krn; + +template +class reduction_over_group_temps_empty_krn; + +template +class single_reduction_axis0_temps_contig_krn; + +template +class first_reduction_axis0_temps_contig_krn; + +template +class middle_reduction_axis0_temps_contig_krn; + +template +class final_reduction_axis0_temps_contig_krn; + +template +class single_custom_reduction_axis0_temps_contig_krn; + +template +class first_custom_reduction_axis0_temps_contig_krn; + +template +class middle_custom_reduction_axis0_temps_contig_krn; + +template +class final_custom_reduction_axis0_temps_contig_krn; + +template +class single_reduction_axis1_temps_contig_krn; + +template +class first_reduction_axis1_temps_contig_krn; + +template +class middle_reduction_axis1_temps_contig_krn; + +template +class final_reduction_axis1_temps_contig_krn; + +template +class single_custom_reduction_axis1_temps_contig_krn; + +template +class first_custom_reduction_axis1_temps_contig_krn; + +template +class middle_custom_reduction_axis1_temps_contig_krn; + +template +class final_custom_reduction_axis1_temps_contig_krn; + +template +sycl::event reduction_over_group_temps_strided_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
+ // number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const py::ssize_t *iter_shape_and_strides, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + int red_nd, + const py::ssize_t *reduction_shape_stride, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + constexpr resTy identity_val = su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const py::ssize_t *const &res_shape = iter_shape_and_strides; + const py::ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class reduction_over_group_temps_empty_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = identity_val; + }); + }); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; + // max_max_wg prevents running out of resources on CPU + constexpr size_t max_max_wg = 2048; + size_t max_wg = std::min( + max_max_wg, d.get_info() / 2); + + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class reduction_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + + 
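// Operators with a known identity take the sycl::reduce_over_group kernel
// in this branch; all others fall through to the else-branch below, which
// allocates shared local memory and reduces with custom_reduce_over_group
// instead.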
cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; + + if (partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } + + const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(depends); + + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of + // iterated dimensions of input array from + // iter_shape_and_strides are going to be accessed by + // inp_indexer + InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + ResIndexerT noop_tmp_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class reduction_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + 
ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + local_memory, reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + }); + + size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class reduction_over_group_temps_strided_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + remaining_reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_temps_strided_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + local_memory, remaining_reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } + }); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, 
static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /* strides */ iter_shape_and_strides + + 2 * iter_nd}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class reduction_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(temp_arg, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, + remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + const sycl::context &ctx = exec_q.get_context(); + + cgh.host_task([ctx, partially_reduced_tmp] { + sycl::free(partially_reduced_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +template +sycl::event reduction_axis1_over_group_temps_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
+    //                          number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    py::ssize_t iter_arg_offset,
+    py::ssize_t iter_res_offset,
+    py::ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    constexpr resTy identity_val = su_ns::Identity<ReductionOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            using InputIterIndexerT =
+                dpctl::tensor::offset_utils::Strided1DIndexer;
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIterIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT = NoOpIndexerT;
+
+            InputOutputIterIndexerT in_out_iter_indexer{
+                InputIterIndexerT{0, static_cast<py::ssize_t>(iter_nelems),
+                                  static_cast<py::ssize_t>(reduction_nelems)},
+                NoOpIndexerT{}};
+            ReductionIndexerT reduction_indexer{};
+
+            cgh.parallel_for<class reduction_seq_contig_krn<
+                argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT>>(
+                sycl::range<1>(iter_nelems),
+                SequentialReduction<argTy, resTy, ReductionOpT,
+                                    InputOutputIterIndexerT,
+                                    ReductionIndexerT>(
+                    arg_tp, res_tp, ReductionOpT(), identity_val,
+                    in_out_iter_indexer, reduction_indexer, reduction_nelems));
+        });
+
+        return comp_ev;
+    }
+
+    constexpr size_t preferred_reductions_per_wi = 8;
+    // max_max_wg prevents running out of resources on CPU
+    constexpr size_t max_max_wg = 2048;
+    size_t max_wg = std::min(
+        max_max_wg,
+        d.get_info<sycl::info::device::max_work_group_size>() / 2);
+
+    size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one work-group per iteration,
+        // can output directly to res
+        sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            using InputIterIndexerT =
+                dpctl::tensor::offset_utils::Strided1DIndexer;
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIterIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT = NoOpIndexerT;
+
+            InputOutputIterIndexerT in_out_iter_indexer{
+                InputIterIndexerT{0, static_cast<py::ssize_t>(iter_nelems),
+                                  static_cast<py::ssize_t>(reduction_nelems)},
+                NoOpIndexerT{}};
+            ReductionIndexerT reduction_indexer{};
+
+            if (iter_nelems == 1) {
+                // increase GPU occupancy
+                wg = max_wg;
+            }
+            reductions_per_wi =
+                std::max<size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+            size_t reduction_groups =
+                (reduction_nelems + reductions_per_wi * wg - 1) /
+                (reductions_per_wi * wg);
+            assert(reduction_groups == 1);
+
+            auto globalRange =
+                sycl::range<1>{iter_nelems * reduction_groups * wg};
+            auto localRange = sycl::range<1>{wg};
+
+            if constexpr (can_use_reduce_over_group<ReductionOpT,
+                                                    resTy>::value)
+            {
+                using KernelName =
+                    class single_reduction_axis1_temps_contig_krn<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT>;
+
+                cgh.parallel_for<KernelName>(
+                    sycl::nd_range<1>(globalRange, localRange),
+                    ReductionOverGroupNoAtomicFunctor<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(),
+                                           identity_val, in_out_iter_indexer,
+                                           reduction_indexer,
+                                           reduction_nelems, iter_nelems,
+                                           reductions_per_wi));
+            }
+            else {
+                using SlmT = sycl::local_accessor<resTy, 1>;
+                SlmT local_memory = SlmT(localRange, cgh);
+                using KernelName =
+                    class single_custom_reduction_axis1_temps_contig_krn<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT, SlmT>;
+
+                cgh.parallel_for<KernelName>(
+                    sycl::nd_range<1>(globalRange, localRange),
+                    CustomReductionOverGroupNoAtomicFunctor<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT, SlmT>(
+                        arg_tp, res_tp, ReductionOpT(), identity_val,
+                        in_out_iter_indexer, reduction_indexer, local_memory,
+                        reduction_nelems, iter_nelems, reductions_per_wi));
+            }
+        });
+        return comp_ev;
+    }
+    else {
+        // more than one work-group is needed, requires a temporary
+        size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        resTy *partially_reduced_tmp = sycl::malloc_device<resTy>(
+            iter_nelems * (reduction_groups + second_iter_reduction_groups_),
+            exec_q);
+        resTy *partially_reduced_tmp2 = nullptr;
+
+        if (partially_reduced_tmp == nullptr) {
+            throw std::runtime_error("Unable to allocate device_memory");
+        }
+        else {
+            partially_reduced_tmp2 =
+                partially_reduced_tmp + reduction_groups * iter_nelems;
+        }
+
+        const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler
+                                                                      &cgh) {
+            cgh.depends_on(depends);
+
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    RowsIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT = NoOpIndexerT;
+
+            RowsIndexerT rows_indexer{
+                0, static_cast<py::ssize_t>(iter_nelems),
+                static_cast<py::ssize_t>(reduction_nelems)};
+            NoOpIndexerT noop_tmp_indexer{};
+            InputOutputIterIndexerT in_out_iter_indexer{rows_indexer,
+                                                        noop_tmp_indexer};
+            ReductionIndexerT reduction_indexer{};
+
+            auto globalRange =
+                sycl::range<1>{iter_nelems * reduction_groups * wg};
+            auto localRange = sycl::range<1>{wg};
+
+            if constexpr (can_use_reduce_over_group<ReductionOpT,
+                                                    resTy>::value)
+            {
+                using KernelName =
+                    class first_reduction_axis1_temps_contig_krn<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT>;
+                cgh.parallel_for<KernelName>(
+                    sycl::nd_range<1>(globalRange, localRange),
+                    ReductionOverGroupNoAtomicFunctor<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT>(
+                        arg_tp, partially_reduced_tmp, ReductionOpT(),
+                        identity_val, in_out_iter_indexer, reduction_indexer,
+                        reduction_nelems, iter_nelems,
+                        preferred_reductions_per_wi));
+            }
+            else {
+                using SlmT = sycl::local_accessor<resTy, 1>;
+                SlmT local_memory = SlmT(localRange, cgh);
+                using KernelName =
+                    class first_custom_reduction_axis1_temps_contig_krn<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT, SlmT>;
+                cgh.parallel_for<KernelName>(
+                    sycl::nd_range<1>(globalRange, localRange),
+                    CustomReductionOverGroupNoAtomicFunctor<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT, SlmT>(
+                        arg_tp, partially_reduced_tmp, ReductionOpT(),
+                        identity_val, in_out_iter_indexer, reduction_indexer,
+                        local_memory, reduction_nelems, iter_nelems,
+                        preferred_reductions_per_wi));
+            }
+        });
+
+        size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+        sycl::event dependent_ev = first_reduction_ev;
+
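+        // Illustrative sizing (added, hedged; assumes wg == 256,
+        // max_wg == 2048, preferred_reductions_per_wi == 8): a row of 10^6
+        // elements leaves ceil(1e6 / (8 * 256)) == 489 partial values after
+        // the first pass; 489 <= 8 * 2048, so the while-loop below is
+        // skipped and the final single work-group pass folds the 489
+        // partials directly into res.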
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            size_t reduction_groups_ =
+                (remaining_reduction_nelems +
+                 preferred_reductions_per_wi * wg - 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            // keep reducing
+            sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler
+                                                                     &cgh) {
+                cgh.depends_on(dependent_ev);
+
+                using InputIndexerT =
+                    dpctl::tensor::offset_utils::Strided1DIndexer;
+                using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+                using InputOutputIterIndexerT =
+                    dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                        InputIndexerT, ResIndexerT>;
+                using ReductionIndexerT =
+                    dpctl::tensor::offset_utils::NoOpIndexer;
+
+                InputIndexerT inp_indexer{
+                    0, static_cast<py::ssize_t>(iter_nelems),
+                    static_cast<py::ssize_t>(reduction_groups_)};
+                ResIndexerT res_iter_indexer{};
+
+                InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                            res_iter_indexer};
+                ReductionIndexerT reduction_indexer{};
+
+                auto globalRange =
+                    sycl::range<1>{iter_nelems * reduction_groups_ * wg};
+                auto localRange = sycl::range<1>{wg};
+                if constexpr (can_use_reduce_over_group<ReductionOpT,
+                                                        resTy>::value) {
+                    using KernelName =
+                        class middle_reduction_axis1_temps_contig_krn<
+                            resTy, resTy, ReductionOpT,
+                            InputOutputIterIndexerT, ReductionIndexerT>;
+                    cgh.parallel_for<KernelName>(
+                        sycl::nd_range<1>(globalRange, localRange),
+                        ReductionOverGroupNoAtomicFunctor<
+                            resTy, resTy, ReductionOpT,
+                            InputOutputIterIndexerT, ReductionIndexerT>(
+                            temp_arg, temp2_arg, ReductionOpT(), identity_val,
+                            in_out_iter_indexer, reduction_indexer,
+                            remaining_reduction_nelems, iter_nelems,
+                            preferred_reductions_per_wi));
+                }
+                else {
+                    using SlmT = sycl::local_accessor<resTy, 1>;
+                    SlmT local_memory = SlmT(localRange, cgh);
+                    using KernelName =
+                        class middle_custom_reduction_axis1_temps_contig_krn<
+                            resTy, resTy, ReductionOpT,
+                            InputOutputIterIndexerT, ReductionIndexerT, SlmT>;
+                    cgh.parallel_for<KernelName>(
+                        sycl::nd_range<1>(globalRange, localRange),
+                        CustomReductionOverGroupNoAtomicFunctor<
+                            resTy, resTy, ReductionOpT,
+                            InputOutputIterIndexerT, ReductionIndexerT, SlmT>(
+                            temp_arg, temp2_arg, ReductionOpT(), identity_val,
+                            in_out_iter_indexer, reduction_indexer,
+                            local_memory, remaining_reduction_nelems,
+                            iter_nelems, preferred_reductions_per_wi));
+                }
+            });
+
+            remaining_reduction_nelems = reduction_groups_;
+            std::swap(temp_arg, temp2_arg);
+            dependent_ev = std::move(partial_reduction_ev);
+        }
+
+        // final reduction to res
+        sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(dependent_ev);
+
+            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+            InputIndexerT inp_indexer{
+                0, static_cast<py::ssize_t>(iter_nelems),
+                static_cast<py::ssize_t>(remaining_reduction_nelems)};
+            ResIndexerT res_iter_indexer{};
+
+            InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                        res_iter_indexer};
+            ReductionIndexerT reduction_indexer{};
+
+            wg = max_wg;
+            reductions_per_wi =
+                std::max<size_t>(1, (remaining_reduction_nelems + wg - 1) / wg);
+
+            size_t reduction_groups =
+                (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+                (reductions_per_wi * wg);
+            assert(reduction_groups == 1);
+
+            auto globalRange =
+                sycl::range<1>{iter_nelems * reduction_groups * wg};
+            auto localRange = sycl::range<1>{wg};
+
+            if constexpr (can_use_reduce_over_group<ReductionOpT,
+                                                    resTy>::value)
+            {
+                using KernelName =
+                    class final_reduction_axis1_temps_contig_krn<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT>;
+                cgh.parallel_for<KernelName>(
+                    sycl::nd_range<1>(globalRange, localRange),
+                    ReductionOverGroupNoAtomicFunctor<
+                        resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT>(temp_arg, res_tp, ReductionOpT(),
+                                           identity_val, in_out_iter_indexer,
+                                           reduction_indexer,
+                                           remaining_reduction_nelems,
+                                           iter_nelems, reductions_per_wi));
+            }
+            else {
+                using SlmT = sycl::local_accessor<resTy, 1>;
+                SlmT local_memory = SlmT(localRange, cgh);
+                using KernelName =
+                    class final_custom_reduction_axis1_temps_contig_krn<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT, SlmT>;
+                cgh.parallel_for<KernelName>(
+                    sycl::nd_range<1>(globalRange, localRange),
+                    CustomReductionOverGroupNoAtomicFunctor<
+                        resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT, SlmT>(
+                        temp_arg, res_tp, ReductionOpT(), identity_val,
+                        in_out_iter_indexer, reduction_indexer, local_memory,
+                        remaining_reduction_nelems, iter_nelems,
+                        reductions_per_wi));
+            }
+        });
+
+        sycl::event cleanup_host_task_event =
+            exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(final_reduction_ev);
+                const sycl::context &ctx = exec_q.get_context();
+
+                cgh.host_task([ctx, partially_reduced_tmp] {
+                    sycl::free(partially_reduced_tmp, ctx);
+                });
+            });
+
+        // FIXME: do not return host-task event
+        //   Instead collect all host-tasks to a list
+
+        return cleanup_host_task_event;
+    }
+}
+
+template <typename argTy, typename resTy, typename ReductionOpT>
+sycl::event reduction_axis0_over_group_temps_contig_impl(
+    sycl::queue &exec_q,
+    size_t iter_nelems, // number of reductions    (num. of rows in a matrix
+                        // when reducing over rows)
+    size_t reduction_nelems, // size of each reduction  (length of rows, i.e.
+ // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + constexpr resTy identity_val = su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + using KernelName = + class reduction_seq_contig_krn; + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; + // max_max_wg prevents running out of resources on CPU + constexpr size_t max_max_wg = 2048; + size_t max_wg = std::min( + max_max_wg, d.get_info() / 2); + + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = + class single_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, 
in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class single_custom_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; + + if (partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } + + const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + NoOpIndexerT columns_indexer{}; + NoOpIndexerT noop_tmp_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class first_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class first_custom_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + local_memory, reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + }); + + size_t remaining_reduction_nelems = 
reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class middle_reduction_axis0_temps_contig_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + remaining_reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class middle_custom_reduction_axis0_temps_contig_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + local_memory, remaining_reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } + }); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + 
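+            // Note (added, hedged): wg was just raised to max_wg and
+            // reductions_per_wi sized so that one work-group per iteration
+            // covers all remaining partial values; hence the single-group
+            // invariant asserted next.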
assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class final_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(temp_arg, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, + remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class final_custom_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + const sycl::context &ctx = exec_q.get_context(); + + cgh.host_task([ctx, partially_reduced_tmp] { + sycl::free(partially_reduced_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +/* @brief Types supported by comparison-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForCompReductionAtomic +{ + + /* value is true if a kernel for must be instantiated, false + * otherwise */ + // disjunction is C++17 feature, supported by DPC++ + static constexpr bool is_defined = std::disjunction< + // input int32 + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForCompReductionTemps +{ + + // disjunction is C++17 feature, supported by DPC++ + static constexpr bool is_defined = std::disjunction< + // input bool + td_ns::TypePairDefinedEntry, + // input int8_t + td_ns::TypePairDefinedEntry, + // input uint8_t + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + // input uint16_t + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct MaxOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, 
dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, 
dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +// Sum + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForSumReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + 
// input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForSumReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-throug + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct SumOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return 
dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +// Product + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForProductReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForProductReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + 
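+        // Note (added, hedged): each TypePairDefinedEntry row names one
+        // (input, output) type pair for which a product kernel is
+        // instantiated; pairs not listed fall through to NotDefinedEntry,
+        // so the corresponding factory's get() returns a nullptr function
+        // pointer.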
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-throug + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct ProductOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, 
ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct TypePairSupportDataForHypotReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct HypotOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct HypotOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct HypotOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct TypePairSupportDataForLogSumExpReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct LogSumExpOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForLogSumExpReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::LogSumExp; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct LogSumExpOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForLogSumExpReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::LogSumExp; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct LogSumExpOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForLogSumExpReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::LogSumExp; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +// Argmax and Argmin + +/* Sequential search reduction */ + +template +struct SequentialSearchReduction +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + +public: + SequentialSearchReduction(const argT *inp, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size) + : inp_(inp), out_(res), reduction_op_(reduction_op), + identity_(identity_val), idx_reduction_op_(idx_reduction_op), + idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); + const py::ssize_t &inp_iter_offset = + inp_out_iter_offsets_.get_first_offset(); + const py::ssize_t &out_iter_offset = + inp_out_iter_offsets_.get_second_offset(); + + argT red_val(identity_); + outT idx_val(idx_identity_); + for (size_t m = 0; m < reduction_max_gid_; ++m) { + const py::ssize_t inp_reduction_offset = + inp_reduced_dims_indexer_(m); + const py::ssize_t inp_offset = + inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == red_val) { + idx_val = idx_reduction_op_(idx_val, static_cast(m)); + } + else { + 
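+                // Note (added, hedged): the comparison branches below
+                // deliberately propagate NaNs: a NaN candidate always
+                // replaces the running extreme value, so the reported index
+                // tracks a NaN whenever one occurs in the reduced slice
+                // (NaN-aware min/max semantics).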
if constexpr (su_ns::IsMinimum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always returns false for NaNs, so check + if (less_complex(val, red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + red_val = val; + idx_val = static_cast(m); + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val < red_val || std::isnan(val)) { + red_val = val; + idx_val = static_cast(m); + } + } + else { + if (val < red_val) { + red_val = val; + idx_val = static_cast(m); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if (greater_complex(val, red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + red_val = val; + idx_val = static_cast(m); + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val > red_val || std::isnan(val)) { + red_val = val; + idx_val = static_cast(m); + } + } + else { + if (val > red_val) { + red_val = val; + idx_val = static_cast(m); + } + } + } + } + } + out_[out_iter_offset] = idx_val; + } +}; + +/* = Search reduction using reduce_over_group*/ + +template +struct SearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + SearchReduction(const argT *data, + argT *vals, + const outT *inds, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + 
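// A host-side sketch (not from this patch) of the selection rule the
// sequential branch above applies per element: a NaN candidate always
// displaces the running value (val < red || isnan(val)), so any NaN in the
// reduced range wins over every finite value, and ties between equal values
// resolve toward the smaller index via the index-minimum combine.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

static std::size_t argmin_sketch(const std::vector<double> &v)
{
    std::size_t idx = 0;
    double red = v[0];
    for (std::size_t m = 1; m < v.size(); ++m) {
        const double val = v[m];
        if (val == red) {
            idx = std::min(idx, m); // mirrors idx_reduction_op_ == minimum
        }
        else if (val < red || std::isnan(val)) {
            red = val;
            idx = m;
        }
    }
    return idx;
}

int main()
{
    std::cout << argmin_sketch({3.0, 1.0, 1.0, 2.0}) << "\n"; // 1
    std::cout << argmin_sketch({3.0, NAN, 0.0}) << "\n";      // 1 (NaN wins)
}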
size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == local_red_val) { + if constexpr (!First) { + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); + } + else { + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); + } + } + else { + if constexpr (su_ns::IsMinimum::value) { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, identity_, reduction_op_); + + if constexpr (std::is_integral_v) { + local_idx = + (red_val_over_wg == local_red_val) ? local_idx : idx_identity_; + } + else { + local_idx = + (red_val_over_wg == local_red_val || + std::isnan(red_val_over_wg) || std::isnan(local_red_val)) + ? local_idx + : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +/* = Search reduction using custom_reduce_over_group*/ + +template +struct CustomSearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + CustomSearchReduction(const argT *data, + argT *vals, + outT *inds, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t 
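// A host-side sketch (not from this patch) of the two-step group combine
// used above: values are reduced first; then every work-item whose local
// value does not match the group result contributes the index identity
// (for sycl::minimum over indices, the maximum representable index), so a
// second reduce over indices selects the first matching position.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

int main()
{
    // per-work-item partial results (local_red_val, local_idx)
    const std::vector<double> local_val{4.0, 2.0, 2.0, 7.0};
    const std::vector<std::int64_t> local_idx{0, 5, 3, 9};
    constexpr std::int64_t idx_identity =
        std::numeric_limits<std::int64_t>::max();

    // step 1: reduce_over_group on the values (here: minimum)
    const double red_val =
        *std::min_element(local_val.begin(), local_val.end());

    // step 2: mask the losers with the identity, then min-reduce the indices
    std::int64_t idx = idx_identity;
    for (std::size_t i = 0; i < local_val.size(); ++i) {
        idx = std::min(idx, (local_val[i] == red_val) ? local_idx[i]
                                                      : idx_identity);
    }
    std::cout << red_val << " at index " << idx << "\n"; // 2 at index 3
}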
reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == local_red_val) { + if constexpr (!First) { + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); + } + else { + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); + } + } + else { + if constexpr (su_ns::IsMinimum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always returns false for NaNs, so + // check + if (less_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val < local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if (greater_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val > local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + // equality does not hold for NaNs, so check here + local_idx = (red_val_over_wg == local_red_val || + 
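// The complex branches above rely on math_utils::less_complex /
// greater_complex to impose a total order on std::complex (which has no
// operator<). A stand-in sketch, assuming the usual lexicographic order
// (real part first, imaginary part as tie-break) — this ordering is an
// assumption, not a quote of the dpctl implementation:
#include <complex>
#include <iostream>

template <typename T>
bool less_complex_sketch(const std::complex<T> &x, const std::complex<T> &y)
{
    return (x.real() < y.real()) ||
           (x.real() == y.real() && x.imag() < y.imag());
}

int main()
{
    const std::complex<double> a{1.0, 5.0}, b{1.0, 7.0};
    std::cout << std::boolalpha << less_complex_sketch(a, b) << "\n"; // true
}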
std::isnan(std::real(local_red_val)) || + std::isnan(std::imag(local_red_val))) + ? local_idx + : idx_identity_; + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + // equality does not hold for NaNs, so check here + local_idx = + (red_val_over_wg == local_red_val || std::isnan(local_red_val)) + ? local_idx + : idx_identity_; + } + else { + local_idx = + red_val_over_wg == local_red_val ? local_idx : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +typedef sycl::event (*search_strided_impl_fn_ptr)( + sycl::queue, + size_t, + size_t, + const char *, + char *, + int, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + int, + const py::ssize_t *, + py::ssize_t, + const std::vector &); + +template +class search_seq_strided_krn; + +template +class search_over_group_temps_strided_krn; + +template +class custom_search_over_group_temps_strided_krn; + +template class search_empty_krn; + +template +class search_seq_contig_krn; + +template +class single_search_axis0_temps_contig_krn; + +template +class first_search_axis0_temps_contig_krn; + +template +class middle_search_axis0_temps_contig_krn; + +template +class final_search_axis0_temps_contig_krn; + +template +class single_custom_search_axis0_temps_contig_krn; + +template +class first_custom_search_axis0_temps_contig_krn; + +template +class middle_custom_search_axis0_temps_contig_krn; + +template +class final_custom_search_axis0_temps_contig_krn; + +template +class single_search_axis1_temps_contig_krn; + +template +class first_search_axis1_temps_contig_krn; + +template +class middle_search_axis1_temps_contig_krn; + +template +class final_search_axis1_temps_contig_krn; + +template +class single_custom_search_axis1_temps_contig_krn; + +template +class first_custom_search_axis1_temps_contig_krn; + +template +class middle_custom_search_axis1_temps_contig_krn; + +template +class final_custom_search_axis1_temps_contig_krn; + +template +sycl::event search_over_group_temps_strided_impl( + sycl::queue exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
+ // number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const py::ssize_t *iter_shape_and_strides, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + int red_nd, + const py::ssize_t *reduction_shape_stride, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + constexpr argTy identity_val = su_ns::Identity::value; + constexpr resTy idx_identity_val = su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const py::ssize_t *const &res_shape = iter_shape_and_strides; + const py::ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class search_empty_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = idx_identity_val; + }); + }); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 4; + // max_max_wg prevents running out of resources on CPU + size_t max_wg = + std::min(size_t(2048), + d.get_info() / 2); + + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, 
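// The launch arithmetic of this single-work-group branch, as a host sketch
// (not from this patch): reductions_per_wi is raised until one group of
// size wg covers the whole reduction, which is exactly what the kernel's
// assert(reduction_groups == 1) checks.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t reduction_nelems = 3000, wg = 256;
    const std::size_t reductions_per_wi =
        std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg); // ceil
    const std::size_t reduction_groups =
        (reduction_nelems + reductions_per_wi * wg - 1) /
        (reductions_per_wi * wg);
    assert(reduction_groups == 1);
    std::cout << reductions_per_wi << " elements per work-item\n"; // 12
}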
+ InputOutputIterIndexerT, ReductionIndexerT, true, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; + + if (partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } + + argTy *partially_reduced_vals_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + argTy *partially_reduced_vals_tmp2 = nullptr; + + if (partially_reduced_vals_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + } + + sycl::event first_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of iterated + // dimensions of input array from iter_shape_and_strides are going + // to be accessed by inp_indexer + InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + ResIndexerT noop_tmp_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + 
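// Why two scratch allocations: a multi-pass argmin/argmax must carry both
// the candidate values and their indices between passes. Each allocation
// holds two slabs so passes can ping-pong between them, mirroring the
// partially_reduced_tmp / partially_reduced_tmp2 pointer split above.
// A host sketch (not from this patch):
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    const std::size_t iter_nelems = 4, reduction_groups = 10,
                      second_iter_groups = 2;
    std::vector<std::int64_t> idx_tmp(
        iter_nelems * (reduction_groups + second_iter_groups));
    std::int64_t *slab_a = idx_tmp.data();
    std::int64_t *slab_b = slab_a + reduction_groups * iter_nelems;
    std::cout << (slab_b - slab_a) << " entries in the first slab\n"; // 40
}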
IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } + }); + + size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, + false, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + }); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + sycl::event 
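// The shape of the multi-pass loop above, as a host sketch (not from this
// patch): every pass shrinks the number of surviving partial results by a
// factor of reductions_per_wi * wg, swapping the scratch buffers, until the
// remainder fits a single-group final pass.
#include <cstddef>
#include <iostream>
#include <utility>

int main()
{
    std::size_t remaining = 1000000;
    const std::size_t wg = 256, rpwi = 4, max_wg = 1024;
    int buf = 0, buf2 = 1; // stand-ins for temp_arg / temp2_arg
    int passes = 1;        // the first reduction has already run
    while (remaining > rpwi * max_wg) {
        remaining = (remaining + rpwi * wg - 1) / (rpwi * wg);
        std::swap(buf, buf2); // ping-pong the scratch buffers
        ++passes;
    }
    std::cout << passes << " passes, then a final single-group pass over "
              << remaining << " values\n"; // 2 passes ... over 977 values
}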
final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /* strides */ iter_shape_and_strides + + 2 * iter_nd}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, false, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + sycl::context ctx = exec_q.get_context(); + + cgh.host_task( + [ctx, partially_reduced_tmp, partially_reduced_vals_tmp] { + sycl::free(partially_reduced_tmp, ctx); + sycl::free(partially_reduced_vals_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +typedef sycl::event (*search_contig_impl_fn_ptr)( + sycl::queue, + size_t, + size_t, + const char *, + char *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event search_axis1_over_group_temps_contig_impl( + sycl::queue exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
+ // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + constexpr argTy identity_val = su_ns::Identity::value; + constexpr resTy idx_identity_val = su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(idx_identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; + // max_max_wg prevents running out of resources on CPU + size_t max_wg = + std::min(size_t(2048), + d.get_info() / 2); + + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class single_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, 
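// For this axis-1 contiguous case, Strided1DIndexer{0, iter_nelems,
// reduction_nelems} appears to map row i of a row-major
// (iter_nelems x reduction_nelems) matrix to offset i * reduction_nelems,
// while the reduction itself walks the row at unit stride (NoOpIndexer).
// Host sketch (not from this patch):
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t iter_nelems = 3, reduction_nelems = 5;
    for (std::size_t i = 0; i < iter_nelems; ++i) {
        const std::size_t row_start = 0 + i * reduction_nelems; // offset+step
        std::cout << "row " << i << " starts at " << row_start << "\n";
    }
}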
reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class single_custom_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; + + if (partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } + + argTy *partially_reduced_vals_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + argTy *partially_reduced_vals_tmp2 = nullptr; + + if (partially_reduced_vals_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + } + + sycl::event first_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class first_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class first_custom_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + 
IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } + }); + + size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class middle_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class middle_custom_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, + false, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + }); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT 
in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class final_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class final_custom_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, false, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + sycl::context ctx = exec_q.get_context(); + + cgh.host_task( + [ctx, partially_reduced_tmp, partially_reduced_vals_tmp] { + sycl::free(partially_reduced_tmp, ctx); + sycl::free(partially_reduced_vals_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +template +sycl::event search_axis0_over_group_temps_contig_impl( + sycl::queue exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
+ // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + constexpr argTy identity_val = su_ns::Identity::value; + constexpr resTy idx_identity_val = su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(idx_identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + using KernelName = + class search_seq_contig_krn; + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; + // max_max_wg prevents running out of resources on CPU + size_t max_wg = + std::min(size_t(2048), + d.get_info() / 2); + + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class single_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), 
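// In the axis-0 contiguous case the roles flip: the iteration index j is a
// column, and the reduction indexer Strided1DIndexer{0, reduction_nelems,
// iter_nelems} walks that column with step iter_nelems (the row pitch of a
// row-major matrix), so element m of column j sits at j + m * iter_nelems.
// Host sketch (not from this patch):
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t iter_nelems = 4 /* cols */, reduction_nelems = 3;
    const std::size_t j = 2; // reduce column 2
    for (std::size_t m = 0; m < reduction_nelems; ++m) {
        std::cout << "element " << m << " of column " << j << " at offset "
                  << j + m * iter_nelems << "\n"; // 2, 6, 10
    }
}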
idx_identity_val, + in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class single_custom_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); return comp_ev; } else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -754,113 +5261,173 @@ sycl::event sum_reduction_over_group_temps_strided_impl( resTy *partially_reduced_tmp2 = nullptr; if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unabled to allocate device_memory"); + throw std::runtime_error("Unable to allocate device_memory"); } else { partially_reduced_tmp2 = partially_reduced_tmp + reduction_groups * iter_nelems; } - const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler - &cgh) { + argTy *partially_reduced_vals_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + argTy *partially_reduced_vals_tmp2 = nullptr; + + if (partially_reduced_vals_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + } + + sycl::event first_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; - using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; using InputOutputIterIndexerT = dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; - - // Only 2*iter_nd entries describing shape and strides of iterated - // dimensions of input array from iter_shape_and_strides are going - // to be accessed by inp_indexer - InputIndexerT inp_indexer(iter_nd, iter_arg_offset, - iter_shape_and_strides); - ResIndexerT noop_tmp_indexer{}; + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; - InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, - noop_tmp_indexer}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + 
ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class first_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class first_custom_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } }); size_t remaining_reduction_nelems = reduction_groups; resTy *temp_arg = partially_reduced_tmp; resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing - sycl::event partial_reduction_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dependent_ev); - - using InputIndexerT = - dpctl::tensor::offset_utils::Strided1DIndexer; - using ResIndexerT = - dpctl::tensor::offset_utils::NoOpIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::NoOpIndexer; - - InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; - ResIndexerT res_iter_indexer{}; - - InputOutputIterIndexerT in_out_iter_indexer{ - inp_indexer, res_iter_indexer}; - ReductionIndexerT reduction_indexer{}; - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups_ * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = class sum_reduction_over_group_temps_krn< - resTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + sycl::event 
partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class middle_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, + false>; cgh.parallel_for( sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor< - resTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>( - temp_arg, temp2_arg, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, + SearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class middle_custom_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, + false, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); - }); + preferred_reductions_per_wi)); + } + }); remaining_reduction_nelems = reduction_groups_; std::swap(temp_arg, temp2_arg); - dependent_ev = std::move(partial_reduction_ev); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; } // final reduction to res @@ -868,8 +5435,7 @@ sycl::event sum_reduction_over_group_temps_strided_impl( cgh.depends_on(dependent_ev); using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - using ResIndexerT = - dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; using InputOutputIterIndexerT = dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< InputIndexerT, ResIndexerT>; @@ -878,10 +5444,7 @@ sycl::event sum_reduction_over_group_temps_strided_impl( InputIndexerT inp_indexer{ 0, static_cast(iter_nelems), static_cast(remaining_reduction_nelems)}; - ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, - /* shape */ iter_shape_and_strides, - /*s trides */ iter_shape_and_strides + - 2 * iter_nd}; + ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; @@ -900,28 +5463,54 @@ sycl::event sum_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class 
sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - temp_arg, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, - remaining_reduction_nelems, iter_nelems, - reductions_per_wi)); + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class final_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class final_custom_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, false, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } }); sycl::event cleanup_host_task_event = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(final_reduction_ev); - const sycl::context &ctx = exec_q.get_context(); + sycl::context ctx = exec_q.get_context(); - cgh.host_task([ctx, partially_reduced_tmp] { - sycl::free(partially_reduced_tmp, ctx); - }); + cgh.host_task( + [ctx, partially_reduced_tmp, partially_reduced_vals_tmp] { + sycl::free(partially_reduced_tmp, ctx); + sycl::free(partially_reduced_vals_tmp, ctx); + }); }); // FIXME: do not return host-task event @@ -931,183 +5520,88 @@ sycl::event sum_reduction_over_group_temps_strided_impl( } } -/* @brief Types supported by plus-reduction code based on atomic_ref */ -template -struct TypePairSupportDataForSumReductionAtomic -{ - - /* value if true a kernel for must be instantiated, false - * otherwise */ - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int32 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint32 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - 
td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int64 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint64 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input half - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input float - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input double - td_ns::TypePairDefinedEntry, - // fall-through - td_ns::NotDefinedEntry>::is_defined; -}; - template -struct TypePairSupportDataForSumReductionTemps +struct TypePairSupportDataForSearchReductionTemps { static constexpr bool is_defined = std::disjunction< // disjunction is C++17 // feature, supported // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int8_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, // input uint8_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input int16_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, // input uint16_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input int32_t - td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, - // input uint32_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input int64_t td_ns::TypePairDefinedEntry, // input uint32_t - td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input half - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns:: - TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry, // input float - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry, // input double - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry, // input std::complex td_ns::TypePairDefinedEntry, outTy, - std::complex>, - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, + std::int64_t>, td_ns::TypePairDefinedEntry, outTy, - std::complex>, + std::int64_t>, - // fall-throug + // fall-through td_ns::NotDefinedEntry>::is_defined; }; template -struct SumOverAxisAtomicStridedFactory +struct ArgmaxOverAxisTempsStridedFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionAtomic< + if constexpr (TypePairSupportDataForSearchReductionTemps< srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_over_group_with_atomics_strided_impl; + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + 
search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } } else { return nullptr; @@ -1116,14 +5610,32 @@ struct SumOverAxisAtomicStridedFactory }; template -struct SumOverAxisTempsStridedFactory +struct ArgmaxOverAxis1TempsContigFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionTemps< - srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_over_group_temps_strided_impl; + if constexpr (TypePairSupportDataForSearchReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } } else { return nullptr; @@ -1132,16 +5644,32 @@ struct SumOverAxisTempsStridedFactory }; template -struct SumOverAxis1AtomicContigFactory +struct ArgmaxOverAxis0TempsContigFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionAtomic< + if constexpr (TypePairSupportDataForSearchReductionTemps< srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_axis1_over_group_with_atomics_contig_impl; + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } } else { return nullptr; @@ -1150,16 +5678,100 @@ struct SumOverAxis1AtomicContigFactory }; template -struct SumOverAxis0AtomicContigFactory +struct ArgminOverAxisTempsStridedFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionAtomic< + if constexpr (TypePairSupportDataForSearchReductionTemps< srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_axis0_over_group_with_atomics_contig_impl; + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgminOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSearchReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { 
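+                // Assumption (template arguments are elided in this
+                // rendering of the patch): this branch covers bool,
+                // floating-point and complex srcTy. Plain sycl::minimum
+                // compares with operator<, so (NaN < x) and (x < NaN) are
+                // both false and NaNs need not propagate; su_ns::Minimum
+                // tests std::isnan(x) first and returns the NaN operand,
+                // and it is also defined for std::complex via min_complex.
+                // The index op stays sycl::minimum so that ties resolve to
+                // the smallest index.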
+ // op for values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgminOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSearchReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } } else { return nullptr; diff --git a/dpctl/tensor/libtensor/include/kernels/repeat.hpp b/dpctl/tensor/libtensor/include/kernels/repeat.hpp index da1989fc3c..1f2335fc6c 100644 --- a/dpctl/tensor/libtensor/include/kernels/repeat.hpp +++ b/dpctl/tensor/libtensor/include/kernels/repeat.hpp @@ -46,14 +46,16 @@ namespace py = pybind11; using namespace dpctl::tensor::offset_utils; template class repeat_by_sequence_kernel; template @@ -66,8 +68,8 @@ class RepeatSequenceFunctor const repT *cumsum = nullptr; size_t src_axis_nelems = 1; OrthogIndexer orthog_strider; - AxisIndexer src_axis_strider; - AxisIndexer dst_axis_strider; + SrcAxisIndexer src_axis_strider; + DstAxisIndexer dst_axis_strider; RepIndexer reps_strider; public: @@ -77,8 +79,8 @@ class RepeatSequenceFunctor const repT *cumsum_, size_t src_axis_nelems_, OrthogIndexer orthog_strider_, - AxisIndexer src_axis_strider_, - AxisIndexer dst_axis_strider_, + SrcAxisIndexer src_axis_strider_, + DstAxisIndexer dst_axis_strider_, RepIndexer reps_strider_) : src(src_), dst(dst_), reps(reps_), cumsum(cumsum_), src_axis_nelems(src_axis_nelems_), orthog_strider(orthog_strider_), @@ -167,12 +169,12 @@ repeat_by_sequence_impl(sycl::queue &q, const size_t gws = orthog_nelems * src_axis_nelems; - cgh.parallel_for>( + cgh.parallel_for>( sycl::range<1>(gws), RepeatSequenceFunctor( + Strided1DIndexer, Strided1DIndexer, T, repT>( src_tp, dst_tp, reps_tp, cumsum_tp, src_axis_nelems, orthog_indexer, src_axis_indexer, dst_axis_indexer, reps_indexer)); @@ -197,8 +199,8 @@ typedef sycl::event (*repeat_by_sequence_1d_fn_ptr_t)( char *, const char *, const char *, - py::ssize_t, - py::ssize_t, + int, + const py::ssize_t *, py::ssize_t, py::ssize_t, py::ssize_t, @@ -212,8 +214,8 @@ sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, char *dst_cp, const char *reps_cp, const char *cumsum_cp, - py::ssize_t src_shape, - py::ssize_t src_stride, + int src_nd, + const py::ssize_t *src_shape_strides, py::ssize_t dst_shape, py::ssize_t dst_stride, py::ssize_t reps_shape, @@ -231,19 +233,19 @@ sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, // orthog ndim indexer TwoZeroOffsets_Indexer orthog_indexer{}; // indexers along repeated axis - Strided1DIndexer src_indexer{0, src_shape, src_stride}; + StridedIndexer src_indexer{src_nd, 0, src_shape_strides}; Strided1DIndexer dst_indexer{0, dst_shape, dst_stride}; // indexer along reps array Strided1DIndexer reps_indexer{0, reps_shape, reps_stride}; const size_t gws = src_nelems; - cgh.parallel_for< - repeat_by_sequence_kernel>( + cgh.parallel_for>( 
sycl::range<1>(gws), - RepeatSequenceFunctor( + RepeatSequenceFunctor( src_tp, dst_tp, reps_tp, cumsum_tp, src_nelems, orthog_indexer, src_indexer, dst_indexer, reps_indexer)); }); @@ -260,10 +262,16 @@ template struct RepeatSequence1DFactory } }; -template +template class repeat_by_scalar_kernel; -template +template class RepeatScalarFunctor { private: @@ -272,8 +280,8 @@ class RepeatScalarFunctor const py::ssize_t reps = 1; size_t dst_axis_nelems = 0; OrthogIndexer orthog_strider; - AxisIndexer src_axis_strider; - AxisIndexer dst_axis_strider; + SrcAxisIndexer src_axis_strider; + DstAxisIndexer dst_axis_strider; public: RepeatScalarFunctor(const T *src_, @@ -281,8 +289,8 @@ class RepeatScalarFunctor const py::ssize_t reps_, size_t dst_axis_nelems_, OrthogIndexer orthog_strider_, - AxisIndexer src_axis_strider_, - AxisIndexer dst_axis_strider_) + SrcAxisIndexer src_axis_strider_, + DstAxisIndexer dst_axis_strider_) : src(src_), dst(dst_), reps(reps_), dst_axis_nelems(dst_axis_nelems_), orthog_strider(orthog_strider_), src_axis_strider(src_axis_strider_), dst_axis_strider(dst_axis_strider_) @@ -354,10 +362,11 @@ sycl::event repeat_by_scalar_impl(sycl::queue &q, const size_t gws = orthog_nelems * dst_axis_nelems; - cgh.parallel_for>( + cgh.parallel_for>( sycl::range<1>(gws), - RepeatScalarFunctor( + RepeatScalarFunctor( src_tp, dst_tp, reps, dst_axis_nelems, orthog_indexer, src_axis_indexer, dst_axis_indexer)); }); @@ -380,8 +389,8 @@ typedef sycl::event (*repeat_by_scalar_1d_fn_ptr_t)( const char *, char *, const py::ssize_t, - py::ssize_t, - py::ssize_t, + int, + const py::ssize_t *, py::ssize_t, py::ssize_t, const std::vector &); @@ -392,8 +401,8 @@ sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, const char *src_cp, char *dst_cp, const py::ssize_t reps, - py::ssize_t src_shape, - py::ssize_t src_stride, + int src_nd, + const py::ssize_t *src_shape_strides, py::ssize_t dst_shape, py::ssize_t dst_stride, const std::vector &depends) @@ -407,17 +416,18 @@ sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, // orthog ndim indexer TwoZeroOffsets_Indexer orthog_indexer{}; // indexers along repeated axis - Strided1DIndexer src_indexer(0, src_shape, src_stride); + StridedIndexer src_indexer(src_nd, 0, src_shape_strides); Strided1DIndexer dst_indexer{0, dst_shape, dst_stride}; const size_t gws = dst_nelems; - cgh.parallel_for>( + cgh.parallel_for>( sycl::range<1>(gws), - RepeatScalarFunctor( - src_tp, dst_tp, reps, dst_nelems, orthog_indexer, src_indexer, - dst_indexer)); + RepeatScalarFunctor(src_tp, dst_tp, reps, + dst_nelems, orthog_indexer, + src_indexer, dst_indexer)); }); return repeat_ev; diff --git a/dpctl/tensor/libtensor/include/utils/math_utils.hpp b/dpctl/tensor/libtensor/include/utils/math_utils.hpp index d724e03e35..120a14d536 100644 --- a/dpctl/tensor/libtensor/include/utils/math_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/math_utils.hpp @@ -115,6 +115,26 @@ template T min_complex(const T &x1, const T &x2) return (std::isnan(real1) || isnan_imag1 || lt) ? 
x1 : x2; } +template T logaddexp(T x, T y) +{ + if (x == y) { // handle signed infinities + const T log2 = std::log(T(2)); + return x + log2; + } + else { + const T tmp = x - y; + if (tmp > 0) { + return x + std::log1p(std::exp(-tmp)); + } + else if (tmp <= 0) { + return y + std::log1p(std::exp(tmp)); + } + else { + return std::numeric_limits::quiet_NaN(); + } + } +} + } // namespace math_utils } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index 2fc7b02efa..c0165b0ecc 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -26,14 +26,79 @@ #include #include #include +#include #include +#include "math_utils.hpp" + namespace dpctl { namespace tensor { namespace sycl_utils { +namespace detail +{ + +template struct TypeList; + +template struct TypeList +{ + using head = Head; + using tail = TypeList; +}; + +using NullTypeList = TypeList<>; +template +struct IsNullTypeList : std::conditional_t, + std::true_type, + std::false_type> +{ +}; + +// recursively check if type is contained in given TypeList +template +struct IsContained + : std::conditional_t< + std::is_same_v>, + std::true_type, + IsContained> +{ +}; + +template <> struct TypeList<> +{ +}; + +// std::false_type when last case has been checked for membership +template struct IsContained : std::false_type +{ +}; + +template struct IsComplex : std::false_type +{ +}; +template struct IsComplex> : std::true_type +{ +}; + +} // namespace detail + +template +using sycl_ops = detail::TypeList, + sycl::bit_or, + sycl::bit_xor, + sycl::bit_and, + sycl::maximum, + sycl::minimum, + sycl::multiplies>; + +template struct IsSyclOp +{ + static constexpr bool value = + detail::IsContained>>::value || + detail::IsContained>>::value; +}; /*! @brief Find the smallest multiple of supported sub-group size larger than * nelems */ @@ -66,6 +131,223 @@ size_t choose_workgroup_size(const size_t nelems, return wg; } +template +T custom_reduce_over_group(const GroupT &wg, + LocAccT local_mem_acc, + const T &local_val, + const OpT &op) +{ + size_t wgs = wg.get_local_linear_range(); + local_mem_acc[wg.get_local_linear_id()] = local_val; + + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + T red_val_over_wg = local_mem_acc[0]; + if (wg.leader()) { + for (size_t i = 1; i < wgs; ++i) { + red_val_over_wg = op(red_val_over_wg, local_mem_acc[i]); + } + } + + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + return sycl::group_broadcast(wg, red_val_over_wg); +} + +// Reduction functors + +// Maximum + +template struct Maximum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::max_complex; + return max_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x > y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x || y; + } + else { + return (x > y) ? x : y; + } + } +}; + +// Minimum + +template struct Minimum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::min_complex; + return min_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x < y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x && y; + } + else { + return (x < y) ? 
x : y; + } + } +}; + +// Define identities and operator checking structs + +template struct GetIdentity +{ +}; + +// Maximum + +template +using IsMaximum = std::bool_constant> || + std::is_same_v>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? static_cast(-std::numeric_limits::infinity()) + : std::numeric_limits::lowest()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = false; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{-std::numeric_limits::infinity(), + -std::numeric_limits::infinity()}; +}; + +// Minimum + +template +using IsMinimum = std::bool_constant> || + std::is_same_v>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? static_cast(std::numeric_limits::infinity()) + : std::numeric_limits::max()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = true; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{std::numeric_limits::infinity(), + std::numeric_limits::infinity()}; +}; + +// Plus + +template +using IsPlus = std::bool_constant> || + std::is_same_v>>; +// Multiplies + +template +using IsMultiplies = + std::bool_constant> || + std::is_same_v>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = static_cast(1); +}; + +// LogSumExp + +template struct LogSumExp +{ + T operator()(const T &x, const T &y) const + { + using dpctl::tensor::math_utils::logaddexp; + return logaddexp(x, y); + } +}; + +template +using IsLogSumExp = std::bool_constant>>; + +// only defined for types with infinity +template +struct GetIdentity::value>> +{ + static constexpr T value = -std::numeric_limits::infinity(); +}; + +// Hypot + +template struct Hypot +{ + T operator()(const T &x, const T &y) const + { + return sycl::hypot(x, y); + } +}; + +template +using IsHypot = std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = 0; +}; + +// Identity + +template struct Identity +{ +}; + +template +using UseBuiltInIdentity = + std::conjunction, sycl::has_known_identity>; + +template +struct Identity::value>> +{ + static constexpr T value = GetIdentity::value; +}; + +template +struct Identity::value>> +{ + static constexpr T value = sycl::known_identity::value; +}; + } // namespace sycl_utils } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/clip.cpp b/dpctl/tensor/libtensor/source/clip.cpp new file mode 100644 index 0000000000..ac494c19ae --- /dev/null +++ b/dpctl/tensor/libtensor/source/clip.cpp @@ -0,0 +1,269 @@ +//===-- clip.cpp - Implementation of clip --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
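The Identity/GetIdentity machinery above resolves a reduction's neutral element at compile time: built-in SYCL ops with a known identity defer to sycl::known_identity, while the custom Maximum/Minimum/LogSumExp/Hypot functors are looked up through the GetIdentity specializations keyed by the Is* predicates. A minimal sketch of a caller, assuming the parameter order Identity<Op, T> (the template parameters themselves were elided in this rendering of the patch) and that the definitions above are in scope:

    #include <limits>
    #include <sycl/sycl.hpp>

    namespace su_ns = dpctl::tensor::sycl_utils;

    // custom max over floats: identity is -inf via GetIdentity/IsMaximum
    static_assert(su_ns::Identity<su_ns::Maximum<float>, float>::value ==
                  -std::numeric_limits<float>::infinity());

    // built-in plus over ints: identity comes from sycl::known_identity
    static_assert(su_ns::Identity<sycl::plus<int>, int>::value == 0);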
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines Python API for implementation functions of +/// dpctl.tensor.clip +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "clip.hpp" +#include "kernels/clip.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::clip::clip_contig_impl_fn_ptr_t; +using dpctl::tensor::kernels::clip::clip_strided_impl_fn_ptr_t; + +static clip_contig_impl_fn_ptr_t clip_contig_dispatch_vector[td_ns::num_types]; +static clip_strided_impl_fn_ptr_t + clip_strided_dispatch_vector[td_ns::num_types]; + +void init_clip_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::clip::ClipContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(clip_contig_dispatch_vector); + + using dpctl::tensor::kernels::clip::ClipStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(clip_strided_dispatch_vector); +} + +using dpctl::utils::keep_args_alive; + +std::pair +py_clip(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &min, + const dpctl::tensor::usm_ndarray &max, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, min, max, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + int nd = src.get_ndim(); + int min_nd = min.get_ndim(); + int max_nd = max.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (nd != min_nd || nd != max_nd) { + throw py::value_error( + "Input arrays are not of appropriate dimension for clip kernel."); + } + + if (nd != dst_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for clip kernel."); + } + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *min_shape = min.get_shape_raw(); + const py::ssize_t *max_shape = max.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + size_t nelems(1); + for (int i = 0; i < nd; ++i) { + const auto &sh_i = dst_shape[i]; + nelems *= static_cast(sh_i); + shapes_equal = shapes_equal && (min_shape[i] == sh_i) && + (max_shape[i] == sh_i) && (src_shape[i] == sh_i); + } + + if (!shapes_equal) { + throw py::value_error("Arrays are not of matching shapes."); + } + + if (nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + auto const &same_logical_tensors = + dpctl::tensor::overlap::SameLogicalTensors(); + if ((overlap(dst, src) && !same_logical_tensors(dst, src)) || + (overlap(dst, min) && !same_logical_tensors(dst, min)) || + (overlap(dst, max) && !same_logical_tensors(dst, max))) + { + throw py::value_error("Destination array overlaps with input."); + } + + int min_typenum = min.get_typenum(); + int max_typenum = max.get_typenum(); + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + 
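+    // typenum_to_lookup_id maps the NumPy-style typenum stored on the
+    // usm_ndarray onto the dense [0, td_ns::num_types) index used to
+    // address clip_contig_dispatch_vector/clip_strided_dispatch_vector;
+    // all four ids are compared below because a single id selects the
+    // kernel for src, min, max and dst alike.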
int min_typeid = array_types.typenum_to_lookup_id(min_typenum); + int max_typeid = array_types.typenum_to_lookup_id(max_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid || src_typeid != min_typeid || + src_typeid != max_typeid) + { + throw py::value_error("Input, min, max, and destination arrays must " + "have the same data type"); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accommodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(nelems)) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accommodate all the " + "array elements."); + } + } + + char *src_data = src.get_data(); + char *min_data = min.get_data(); + char *max_data = max.get_data(); + char *dst_data = dst.get_data(); + + bool is_min_c_contig = min.is_c_contiguous(); + bool is_min_f_contig = min.is_f_contiguous(); + + bool is_max_c_contig = max.is_c_contiguous(); + bool is_max_f_contig = max.is_f_contiguous(); + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool all_c_contig = (is_min_c_contig && is_max_c_contig && + is_src_c_contig && is_dst_c_contig); + bool all_f_contig = (is_min_f_contig && is_max_f_contig && + is_src_f_contig && is_dst_f_contig); + + if (all_c_contig || all_f_contig) { + auto fn = clip_contig_dispatch_vector[src_typeid]; + + sycl::event clip_ev = + fn(exec_q, nelems, src_data, min_data, max_data, dst_data, depends); + sycl::event ht_ev = + keep_args_alive(exec_q, {src, min, max, dst}, {clip_ev}); + + return std::make_pair(ht_ev, clip_ev); + } + + auto const &src_strides = src.get_strides_vector(); + auto const &min_strides = min.get_strides_vector(); + auto const &max_strides = max.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_min_strides; + shT simplified_max_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t min_offset(0); + py::ssize_t max_offset(0); + py::ssize_t dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space_4( + nd, src_shape, src_strides, min_strides, max_strides, dst_strides, + // outputs + simplified_shape, simplified_src_strides, simplified_min_strides, + simplified_max_strides, simplified_dst_strides, src_offset, min_offset, + max_offset, dst_offset); + + auto fn = clip_strided_dispatch_vector[src_typeid]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, + // common shape and strides + simplified_shape, simplified_src_strides, simplified_min_strides, + simplified_max_strides, simplified_dst_strides); + py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple); + sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event clip_ev = fn(exec_q, nelems, nd, src_data, min_data, max_data, 
+ dst_data, packed_shape_strides, src_offset, + min_offset, max_offset, dst_offset, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(clip_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([packed_shape_strides, ctx]() { + sycl::free(packed_shape_strides, ctx); + }); + }); + + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {src, min, max, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, clip_ev); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/clip.hpp b/dpctl/tensor/libtensor/source/clip.hpp new file mode 100644 index 0000000000..d4b8af2cf5 --- /dev/null +++ b/dpctl/tensor/libtensor/source/clip.hpp @@ -0,0 +1,52 @@ +//===-- clip.hpp - --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares Python API for implementation functions of +/// dpctl.tensor.clip +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +py_clip(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &min, + const dpctl::tensor::usm_ndarray &max, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +extern void init_clip_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp deleted file mode 100644 index cca0ac7c0a..0000000000 --- a/dpctl/tensor/libtensor/source/elementwise_functions.cpp +++ /dev/null @@ -1,4784 +0,0 @@ -//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
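The temporaries_cleanup_ev submission above is the idiom these sources use to reclaim USM scratch space without blocking the host: the allocation stays alive until the kernel that reads it completes, and a host_task ordered after that kernel calls sycl::free. Factored out as a small helper (hypothetical name, standard SYCL 2020 only):

    #include <sycl/sycl.hpp>

    // Free a USM allocation once `dep` has completed, without
    // synchronizing the host; returns the event of the cleanup task.
    sycl::event free_after(sycl::queue &q, void *usm_ptr,
                           const sycl::event &dep)
    {
        return q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(dep);
            const sycl::context ctx = q.get_context();
            cgh.host_task([usm_ptr, ctx]() { sycl::free(usm_ptr, ctx); });
        });
    }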
-// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions, -/// specifically functions for elementwise operations. -//===----------------------------------------------------------------------===// - -#include "dpctl4pybind11.hpp" -#include -#include -#include -#include -#include - -#include "elementwise_functions.hpp" -#include "utils/type_dispatch.hpp" - -#include "kernels/elementwise_functions/abs.hpp" -#include "kernels/elementwise_functions/acos.hpp" -#include "kernels/elementwise_functions/acosh.hpp" -#include "kernels/elementwise_functions/add.hpp" -#include "kernels/elementwise_functions/asin.hpp" -#include "kernels/elementwise_functions/asinh.hpp" -#include "kernels/elementwise_functions/atan.hpp" -#include "kernels/elementwise_functions/atan2.hpp" -#include "kernels/elementwise_functions/atanh.hpp" -#include "kernels/elementwise_functions/bitwise_and.hpp" -#include "kernels/elementwise_functions/bitwise_invert.hpp" -#include "kernels/elementwise_functions/bitwise_left_shift.hpp" -#include "kernels/elementwise_functions/bitwise_or.hpp" -#include "kernels/elementwise_functions/bitwise_right_shift.hpp" -#include "kernels/elementwise_functions/bitwise_xor.hpp" -#include "kernels/elementwise_functions/ceil.hpp" -#include "kernels/elementwise_functions/conj.hpp" -#include "kernels/elementwise_functions/cos.hpp" -#include "kernels/elementwise_functions/cosh.hpp" -#include "kernels/elementwise_functions/equal.hpp" -#include "kernels/elementwise_functions/exp.hpp" -#include "kernels/elementwise_functions/expm1.hpp" -#include "kernels/elementwise_functions/floor.hpp" -#include "kernels/elementwise_functions/floor_divide.hpp" -#include "kernels/elementwise_functions/greater.hpp" -#include "kernels/elementwise_functions/greater_equal.hpp" -#include "kernels/elementwise_functions/hypot.hpp" -#include "kernels/elementwise_functions/imag.hpp" -#include "kernels/elementwise_functions/isfinite.hpp" -#include "kernels/elementwise_functions/isinf.hpp" -#include "kernels/elementwise_functions/isnan.hpp" -#include "kernels/elementwise_functions/less.hpp" -#include "kernels/elementwise_functions/less_equal.hpp" -#include "kernels/elementwise_functions/log.hpp" -#include "kernels/elementwise_functions/log10.hpp" -#include "kernels/elementwise_functions/log1p.hpp" -#include "kernels/elementwise_functions/log2.hpp" -#include "kernels/elementwise_functions/logaddexp.hpp" -#include "kernels/elementwise_functions/logical_and.hpp" -#include "kernels/elementwise_functions/logical_not.hpp" -#include "kernels/elementwise_functions/logical_or.hpp" -#include "kernels/elementwise_functions/logical_xor.hpp" -#include "kernels/elementwise_functions/maximum.hpp" -#include "kernels/elementwise_functions/minimum.hpp" -#include "kernels/elementwise_functions/multiply.hpp" -#include "kernels/elementwise_functions/negative.hpp" -#include "kernels/elementwise_functions/not_equal.hpp" -#include "kernels/elementwise_functions/positive.hpp" -#include "kernels/elementwise_functions/pow.hpp" -#include "kernels/elementwise_functions/proj.hpp" -#include "kernels/elementwise_functions/real.hpp" -#include "kernels/elementwise_functions/remainder.hpp" -#include "kernels/elementwise_functions/round.hpp" -#include "kernels/elementwise_functions/sign.hpp" -#include "kernels/elementwise_functions/signbit.hpp" -#include "kernels/elementwise_functions/sin.hpp" -#include "kernels/elementwise_functions/sinh.hpp" -#include 
"kernels/elementwise_functions/sqrt.hpp" -#include "kernels/elementwise_functions/square.hpp" -#include "kernels/elementwise_functions/subtract.hpp" -#include "kernels/elementwise_functions/tan.hpp" -#include "kernels/elementwise_functions/tanh.hpp" -#include "kernels/elementwise_functions/true_divide.hpp" -#include "kernels/elementwise_functions/trunc.hpp" - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -namespace td_ns = dpctl::tensor::type_dispatch; - -py::dtype _dtype_from_typenum(td_ns::typenum_t dst_typenum_t) -{ - switch (dst_typenum_t) { - case td_ns::typenum_t::BOOL: - return py::dtype("?"); - case td_ns::typenum_t::INT8: - return py::dtype("i1"); - case td_ns::typenum_t::UINT8: - return py::dtype("u1"); - case td_ns::typenum_t::INT16: - return py::dtype("i2"); - case td_ns::typenum_t::UINT16: - return py::dtype("u2"); - case td_ns::typenum_t::INT32: - return py::dtype("i4"); - case td_ns::typenum_t::UINT32: - return py::dtype("u4"); - case td_ns::typenum_t::INT64: - return py::dtype("i8"); - case td_ns::typenum_t::UINT64: - return py::dtype("u8"); - case td_ns::typenum_t::HALF: - return py::dtype("f2"); - case td_ns::typenum_t::FLOAT: - return py::dtype("f4"); - case td_ns::typenum_t::DOUBLE: - return py::dtype("f8"); - case td_ns::typenum_t::CFLOAT: - return py::dtype("c8"); - case td_ns::typenum_t::CDOUBLE: - return py::dtype("c16"); - default: - throw py::value_error("Unrecognized dst_typeid"); - } -} - -int _result_typeid(int arg_typeid, const int *fn_output_id) -{ - if (arg_typeid < 0 || arg_typeid >= td_ns::num_types) { - throw py::value_error("Input typeid " + std::to_string(arg_typeid) + - " is outside of expected bounds."); - } - - return fn_output_id[arg_typeid]; -} - -namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; -using ew_cmn_ns::binary_contig_impl_fn_ptr_t; -using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; -using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; -using ew_cmn_ns::binary_strided_impl_fn_ptr_t; -using ew_cmn_ns::unary_contig_impl_fn_ptr_t; -using ew_cmn_ns::unary_strided_impl_fn_ptr_t; - -using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; -using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; -using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; - -// U01: ==== ABS (x) -namespace impl -{ - -namespace abs_fn_ns = dpctl::tensor::kernels::abs; - -static unary_contig_impl_fn_ptr_t abs_contig_dispatch_vector[td_ns::num_types]; -static int abs_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - abs_strided_dispatch_vector[td_ns::num_types]; - -void populate_abs_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = abs_fn_ns; - - using fn_ns::AbsContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(abs_contig_dispatch_vector); - - using fn_ns::AbsStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(abs_strided_dispatch_vector); - - using fn_ns::AbsTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(abs_output_typeid_vector); -}; - -} // namespace impl - -// U02: ==== ACOS (x) -namespace impl -{ - -namespace acos_fn_ns = dpctl::tensor::kernels::acos; - -static unary_contig_impl_fn_ptr_t acos_contig_dispatch_vector[td_ns::num_types]; -static int acos_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - acos_strided_dispatch_vector[td_ns::num_types]; - -void populate_acos_dispatch_vectors(void) -{ - using namespace 
td_ns; - namespace fn_ns = acos_fn_ns; - - using fn_ns::AcosContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(acos_contig_dispatch_vector); - - using fn_ns::AcosStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(acos_strided_dispatch_vector); - - using fn_ns::AcosTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(acos_output_typeid_vector); -} - -} // namespace impl - -// U03: ===== ACOSH (x) -namespace impl -{ - -namespace acosh_fn_ns = dpctl::tensor::kernels::acosh; - -static unary_contig_impl_fn_ptr_t - acosh_contig_dispatch_vector[td_ns::num_types]; -static int acosh_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - acosh_strided_dispatch_vector[td_ns::num_types]; - -void populate_acosh_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = acosh_fn_ns; - - using fn_ns::AcoshContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(acosh_contig_dispatch_vector); - - using fn_ns::AcoshStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(acosh_strided_dispatch_vector); - - using fn_ns::AcoshTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(acosh_output_typeid_vector); -} - -} // namespace impl - -// B01: ===== ADD (x1, x2) -namespace impl -{ -namespace add_fn_ns = dpctl::tensor::kernels::add; - -static binary_contig_impl_fn_ptr_t add_contig_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static int add_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - add_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -// add(matrix, row) -static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t - add_contig_matrix_contig_row_broadcast_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -// add(row, matrix) -static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t - add_contig_row_contig_matrix_broadcast_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -static binary_inplace_contig_impl_fn_ptr_t - add_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static binary_inplace_strided_impl_fn_ptr_t - add_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; -static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t - add_inplace_row_matrix_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_add_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = add_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::AddTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(add_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::AddStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(add_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::AddContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(add_contig_dispatch_table); - - // function pointers for operation on contiguous matrix, contiguous row - // with contiguous matrix output - using fn_ns::AddContigMatrixContigRowBroadcastFactory; - DispatchTableBuilder< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, - AddContigMatrixContigRowBroadcastFactory, num_types> - dtb4; - dtb4.populate_dispatch_table( - add_contig_matrix_contig_row_broadcast_dispatch_table); - - // function pointers for 
operation on contiguous row, contiguous matrix - // with contiguous matrix output - using fn_ns::AddContigRowContigMatrixBroadcastFactory; - DispatchTableBuilder< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, - AddContigRowContigMatrixBroadcastFactory, num_types> - dtb5; - dtb5.populate_dispatch_table( - add_contig_row_contig_matrix_broadcast_dispatch_table); - - // function pointers for inplace operation on general strided arrays - using fn_ns::AddInplaceStridedFactory; - DispatchTableBuilder - dtb6; - dtb6.populate_dispatch_table(add_inplace_strided_dispatch_table); - - // function pointers for inplace operation on contiguous inputs and output - using fn_ns::AddInplaceContigFactory; - DispatchTableBuilder - dtb7; - dtb7.populate_dispatch_table(add_inplace_contig_dispatch_table); - - // function pointers for inplace operation on contiguous matrix - // and contiguous row - using fn_ns::AddInplaceRowMatrixBroadcastFactory; - DispatchTableBuilder - dtb8; - dtb8.populate_dispatch_table(add_inplace_row_matrix_dispatch_table); -}; - -} // namespace impl - -// U04: ===== ASIN (x) -namespace impl -{ - -namespace asin_fn_ns = dpctl::tensor::kernels::asin; - -static unary_contig_impl_fn_ptr_t asin_contig_dispatch_vector[td_ns::num_types]; -static int asin_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - asin_strided_dispatch_vector[td_ns::num_types]; - -void populate_asin_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = asin_fn_ns; - - using fn_ns::AsinContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(asin_contig_dispatch_vector); - - using fn_ns::AsinStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(asin_strided_dispatch_vector); - - using fn_ns::AsinTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(asin_output_typeid_vector); -} - -} // namespace impl - -// U05: ===== ASINH (x) -namespace impl -{ - -namespace asinh_fn_ns = dpctl::tensor::kernels::asinh; - -static unary_contig_impl_fn_ptr_t - asinh_contig_dispatch_vector[td_ns::num_types]; -static int asinh_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - asinh_strided_dispatch_vector[td_ns::num_types]; - -void populate_asinh_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = asinh_fn_ns; - - using fn_ns::AsinhContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(asinh_contig_dispatch_vector); - - using fn_ns::AsinhStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(asinh_strided_dispatch_vector); - - using fn_ns::AsinhTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(asinh_output_typeid_vector); -} - -} // namespace impl - -// U06: ===== ATAN (x) -namespace impl -{ - -namespace atan_fn_ns = dpctl::tensor::kernels::atan; - -static unary_contig_impl_fn_ptr_t atan_contig_dispatch_vector[td_ns::num_types]; -static int atan_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - atan_strided_dispatch_vector[td_ns::num_types]; - -void populate_atan_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = atan_fn_ns; - - using fn_ns::AtanContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(atan_contig_dispatch_vector); - - using fn_ns::AtanStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(atan_strided_dispatch_vector); - - using fn_ns::AtanTypeMapFactory; - DispatchVectorBuilder dvb3; - 
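All of the populate_* helpers deleted in this file follow one pattern: a per-operation Factory template whose get() returns either a typed kernel instantiation or nullptr, which DispatchVectorBuilder/DispatchTableBuilder evaluates for every supported type (or type pair) to fill a function-pointer table indexed by type id. A condensed sketch of that pattern, with hypothetical names (neg_impl, NegFactory) and a three-type universe in place of td_ns::num_types:

    #include <cstddef>
    #include <type_traits>

    using unary_fn_t = void (*)(const char *, char *, std::size_t);

    template <typename T>
    void neg_impl(const char *src_p, char *dst_p, std::size_t n)
    {
        const T *src = reinterpret_cast<const T *>(src_p);
        T *dst = reinterpret_cast<T *>(dst_p);
        for (std::size_t i = 0; i < n; ++i) {
            dst[i] = -src[i];
        }
    }

    template <typename fnT, typename T> struct NegFactory
    {
        fnT get() const
        {
            if constexpr (std::is_signed_v<T>) {
                return neg_impl<T>; // supported: return the instantiation
            }
            else {
                return nullptr; // unsupported type: slot stays empty
            }
        }
    };

    // what populate_dispatch_vector produces: one slot per type id
    static unary_fn_t neg_dispatch_vector[] = {
        NegFactory<unary_fn_t, float>{}.get(),
        NegFactory<unary_fn_t, double>{}.get(),
        NegFactory<unary_fn_t, unsigned>{}.get(), // -> nullptr
    };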
dvb3.populate_dispatch_vector(atan_output_typeid_vector); -} - -} // namespace impl - -// B02: ===== ATAN2 (x1, x2) -namespace impl -{ -namespace atan2_fn_ns = dpctl::tensor::kernels::atan2; - -static binary_contig_impl_fn_ptr_t - atan2_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int atan2_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - atan2_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_atan2_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = atan2_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::Atan2TypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(atan2_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::Atan2StridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(atan2_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::Atan2ContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(atan2_contig_dispatch_table); -}; - -} // namespace impl - -// U07: ===== ATANH (x) -namespace impl -{ - -namespace atanh_fn_ns = dpctl::tensor::kernels::atanh; - -static unary_contig_impl_fn_ptr_t - atanh_contig_dispatch_vector[td_ns::num_types]; -static int atanh_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - atanh_strided_dispatch_vector[td_ns::num_types]; - -void populate_atanh_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = atanh_fn_ns; - - using fn_ns::AtanhContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(atanh_contig_dispatch_vector); - - using fn_ns::AtanhStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(atanh_strided_dispatch_vector); - - using fn_ns::AtanhTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(atanh_output_typeid_vector); -} - -} // namespace impl - -// B03: ===== BITWISE_AND (x1, x2) -namespace impl -{ -namespace bitwise_and_fn_ns = dpctl::tensor::kernels::bitwise_and; - -static binary_contig_impl_fn_ptr_t - bitwise_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int bitwise_and_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - bitwise_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_bitwise_and_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_and_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::BitwiseAndTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(bitwise_and_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::BitwiseAndStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(bitwise_and_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::BitwiseAndContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(bitwise_and_contig_dispatch_table); -}; - -} // namespace impl - -// B04: ===== BITWISE_LEFT_SHIFT (x1, x2) -namespace impl -{ -namespace bitwise_left_shift_fn_ns = dpctl::tensor::kernels::bitwise_left_shift; - -static binary_contig_impl_fn_ptr_t - bitwise_left_shift_contig_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static int 
bitwise_left_shift_output_id_table[td_ns::num_types] - [td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - bitwise_left_shift_strided_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -void populate_bitwise_left_shift_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_left_shift_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::BitwiseLeftShiftTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(bitwise_left_shift_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::BitwiseLeftShiftStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(bitwise_left_shift_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::BitwiseLeftShiftContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(bitwise_left_shift_contig_dispatch_table); -}; - -} // namespace impl - -// U08: ===== BITWISE_INVERT (x) -namespace impl -{ - -namespace bitwise_invert_fn_ns = dpctl::tensor::kernels::bitwise_invert; - -static unary_contig_impl_fn_ptr_t - bitwise_invert_contig_dispatch_vector[td_ns::num_types]; -static int bitwise_invert_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - bitwise_invert_strided_dispatch_vector[td_ns::num_types]; - -void populate_bitwise_invert_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_invert_fn_ns; - - using fn_ns::BitwiseInvertContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(bitwise_invert_contig_dispatch_vector); - - using fn_ns::BitwiseInvertStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(bitwise_invert_strided_dispatch_vector); - - using fn_ns::BitwiseInvertTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(bitwise_invert_output_typeid_vector); -}; - -} // namespace impl - -// B05: ===== BITWISE_OR (x1, x2) -namespace impl -{ -namespace bitwise_or_fn_ns = dpctl::tensor::kernels::bitwise_or; - -static binary_contig_impl_fn_ptr_t - bitwise_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int bitwise_or_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - bitwise_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_bitwise_or_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_or_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::BitwiseOrTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(bitwise_or_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::BitwiseOrStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(bitwise_or_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::BitwiseOrContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(bitwise_or_contig_dispatch_table); -}; -} // namespace impl - -// B06: ===== BITWISE_RIGHT_SHIFT (x1, x2) -namespace impl -{ -namespace bitwise_right_shift_fn_ns = - dpctl::tensor::kernels::bitwise_right_shift; - -static binary_contig_impl_fn_ptr_t - bitwise_right_shift_contig_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static int bitwise_right_shift_output_id_table[td_ns::num_types] - [td_ns::num_types]; - -static 
binary_strided_impl_fn_ptr_t - bitwise_right_shift_strided_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -void populate_bitwise_right_shift_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_right_shift_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::BitwiseRightShiftTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(bitwise_right_shift_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::BitwiseRightShiftStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(bitwise_right_shift_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::BitwiseRightShiftContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(bitwise_right_shift_contig_dispatch_table); -}; - -} // namespace impl - -// B07: ===== BITWISE_XOR (x1, x2) -namespace impl -{ -namespace bitwise_xor_fn_ns = dpctl::tensor::kernels::bitwise_xor; - -static binary_contig_impl_fn_ptr_t - bitwise_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int bitwise_xor_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - bitwise_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_bitwise_xor_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_xor_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::BitwiseXorTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(bitwise_xor_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::BitwiseXorStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(bitwise_xor_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::BitwiseXorContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(bitwise_xor_contig_dispatch_table); -}; -} // namespace impl - -// U09: ==== CEIL (x) -namespace impl -{ - -namespace ceil_fn_ns = dpctl::tensor::kernels::ceil; - -static unary_contig_impl_fn_ptr_t ceil_contig_dispatch_vector[td_ns::num_types]; -static int ceil_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - ceil_strided_dispatch_vector[td_ns::num_types]; - -void populate_ceil_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = ceil_fn_ns; - - using fn_ns::CeilContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(ceil_contig_dispatch_vector); - - using fn_ns::CeilStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(ceil_strided_dispatch_vector); - - using fn_ns::CeilTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(ceil_output_typeid_vector); -} - -} // namespace impl - -// U10: ==== CONJ (x) -namespace impl -{ - -namespace conj_fn_ns = dpctl::tensor::kernels::conj; - -static unary_contig_impl_fn_ptr_t conj_contig_dispatch_vector[td_ns::num_types]; -static int conj_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - conj_strided_dispatch_vector[td_ns::num_types]; - -void populate_conj_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = conj_fn_ns; - - using fn_ns::ConjContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(conj_contig_dispatch_vector); - - using 
fn_ns::ConjStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(conj_strided_dispatch_vector); - - using fn_ns::ConjTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(conj_output_typeid_vector); -} -} // namespace impl - -// U11: ==== COS (x) -namespace impl -{ - -namespace cos_fn_ns = dpctl::tensor::kernels::cos; - -static unary_contig_impl_fn_ptr_t cos_contig_dispatch_vector[td_ns::num_types]; -static int cos_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - cos_strided_dispatch_vector[td_ns::num_types]; - -void populate_cos_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = cos_fn_ns; - - using fn_ns::CosContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(cos_contig_dispatch_vector); - - using fn_ns::CosStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(cos_strided_dispatch_vector); - - using fn_ns::CosTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(cos_output_typeid_vector); -} - -} // namespace impl - -// U12: ==== COSH (x) -namespace impl -{ - -namespace cosh_fn_ns = dpctl::tensor::kernels::cosh; - -static unary_contig_impl_fn_ptr_t cosh_contig_dispatch_vector[td_ns::num_types]; -static int cosh_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - cosh_strided_dispatch_vector[td_ns::num_types]; - -void populate_cosh_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = cosh_fn_ns; - - using fn_ns::CoshContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(cosh_contig_dispatch_vector); - - using fn_ns::CoshStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(cosh_strided_dispatch_vector); - - using fn_ns::CoshTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(cosh_output_typeid_vector); -} - -} // namespace impl - -// B08: ==== DIVIDE (x1, x2) -namespace impl -{ -namespace true_divide_fn_ns = dpctl::tensor::kernels::true_divide; - -static binary_contig_impl_fn_ptr_t - true_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int true_divide_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - true_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -// divide(matrix, row) -static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t - true_divide_contig_matrix_contig_row_broadcast_dispatch_table - [td_ns::num_types][td_ns::num_types]; - -// divide(row, matrix) -static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t - true_divide_contig_row_contig_matrix_broadcast_dispatch_table - [td_ns::num_types][td_ns::num_types]; - -void populate_true_divide_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = true_divide_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::TrueDivideTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(true_divide_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::TrueDivideStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(true_divide_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::TrueDivideContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(true_divide_contig_dispatch_table); - - // function pointers for operation on contiguous 
-
-// B08: ==== DIVIDE (x1, x2)
-namespace impl
-{
-namespace true_divide_fn_ns = dpctl::tensor::kernels::true_divide;
-
-static binary_contig_impl_fn_ptr_t
-    true_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int true_divide_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    true_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-// divide(matrix, row)
-static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
-    true_divide_contig_matrix_contig_row_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-// divide(row, matrix)
-static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
-    true_divide_contig_row_contig_matrix_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-void populate_true_divide_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = true_divide_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::TrueDivideTypeMapFactory;
-    DispatchTableBuilder<int, TrueDivideTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(true_divide_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::TrueDivideStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
-                         TrueDivideStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(true_divide_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::TrueDivideContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
-                         TrueDivideContigFactory, num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(true_divide_contig_dispatch_table);
-
-    // function pointers for operation on contiguous matrix, contiguous row
-    // with contiguous matrix output
-    using fn_ns::TrueDivideContigMatrixContigRowBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
-        TrueDivideContigMatrixContigRowBroadcastFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(
-        true_divide_contig_matrix_contig_row_broadcast_dispatch_table);
-
-    // function pointers for operation on contiguous row, contiguous matrix
-    // with contiguous matrix output
-    using fn_ns::TrueDivideContigRowContigMatrixBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
-        TrueDivideContigRowContigMatrixBroadcastFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(
-        true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
-};
-
-} // namespace impl
-
-// B09: ==== EQUAL (x1, x2)
-namespace impl
-{
-namespace equal_fn_ns = dpctl::tensor::kernels::equal;
-
-static binary_contig_impl_fn_ptr_t
-    equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int equal_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_equal_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = equal_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::EqualTypeMapFactory;
-    DispatchTableBuilder<int, EqualTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(equal_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::EqualStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, EqualStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(equal_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::EqualContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, EqualContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(equal_contig_dispatch_table);
-};
-} // namespace impl
-
-// U13: ==== EXP (x)
-namespace impl
-{
-
-namespace exp_fn_ns = dpctl::tensor::kernels::exp;
-
-static unary_contig_impl_fn_ptr_t exp_contig_dispatch_vector[td_ns::num_types];
-static int exp_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    exp_strided_dispatch_vector[td_ns::num_types];
-
-void populate_exp_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = exp_fn_ns;
-
-    using fn_ns::ExpContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ExpContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(exp_contig_dispatch_vector);
-
-    using fn_ns::ExpStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ExpStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(exp_strided_dispatch_vector);
-
-    using fn_ns::ExpTypeMapFactory;
-    DispatchVectorBuilder<int, ExpTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(exp_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U14: ==== EXPM1 (x)
-namespace impl
-{
-
-namespace expm1_fn_ns = dpctl::tensor::kernels::expm1;
-
-static unary_contig_impl_fn_ptr_t
-    expm1_contig_dispatch_vector[td_ns::num_types];
-static int expm1_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    expm1_strided_dispatch_vector[td_ns::num_types];
-
-void populate_expm1_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = expm1_fn_ns;
-
-    using fn_ns::Expm1ContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Expm1ContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(expm1_contig_dispatch_vector);
-
-    using fn_ns::Expm1StridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Expm1StridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(expm1_strided_dispatch_vector);
-
-    using fn_ns::Expm1TypeMapFactory;
-    DispatchVectorBuilder<int, Expm1TypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(expm1_output_typeid_vector);
-}
-
-} // namespace impl
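The recurring comment "which input types are supported, and what is the type of the result" describes the *_output_id_table objects: a num_types x num_types grid mapping a pair of input type ids to a result type id. A minimal sketch of how such a table could be consulted at call time, assuming (hypothetically) that negative entries mark unsupported pairs; the ids and table values below are made up for illustration:

    #include <iostream>
    #include <stdexcept>

    constexpr int num_types = 3; // hypothetical ids: 0=int32, 1=float, 2=double

    // Look up the result type for (src1, src2); reject unsupported pairs.
    int result_type_id(const int (&output_id_table)[num_types][num_types],
                       int src1_type_id, int src2_type_id)
    {
        const int out_id = output_id_table[src1_type_id][src2_type_id];
        if (out_id < 0) {
            throw std::invalid_argument("unsupported input type combination");
        }
        return out_id;
    }

    int main()
    {
        // e.g. int32 op double -> double; a -1 entry would mean "unsupported"
        int table[num_types][num_types] = {{0, 1, 2}, {1, 1, 2}, {2, 2, 2}};
        std::cout << result_type_id(table, 0, 2) << "\n"; // prints 2 (double)
    }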
-
-// U15: ==== FLOOR (x)
-namespace impl
-{
-
-namespace floor_fn_ns = dpctl::tensor::kernels::floor;
-
-static unary_contig_impl_fn_ptr_t
-    floor_contig_dispatch_vector[td_ns::num_types];
-static int floor_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    floor_strided_dispatch_vector[td_ns::num_types];
-
-void populate_floor_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = floor_fn_ns;
-
-    using fn_ns::FloorContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, FloorContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(floor_contig_dispatch_vector);
-
-    using fn_ns::FloorStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, FloorStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(floor_strided_dispatch_vector);
-
-    using fn_ns::FloorTypeMapFactory;
-    DispatchVectorBuilder<int, FloorTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(floor_output_typeid_vector);
-}
-
-} // namespace impl
-
-// B10: ==== FLOOR_DIVIDE (x1, x2)
-namespace impl
-{
-namespace floor_divide_fn_ns = dpctl::tensor::kernels::floor_divide;
-
-static binary_contig_impl_fn_ptr_t
-    floor_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int floor_divide_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    floor_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_floor_divide_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = floor_divide_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::FloorDivideTypeMapFactory;
-    DispatchTableBuilder<int, FloorDivideTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(floor_divide_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::FloorDivideStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
-                         FloorDivideStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(floor_divide_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::FloorDivideContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
-                         FloorDivideContigFactory, num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(floor_divide_contig_dispatch_table);
-};
-
-} // namespace impl
-
-// B11: ==== GREATER (x1, x2)
-namespace impl
-{
-namespace greater_fn_ns = dpctl::tensor::kernels::greater;
-
-static binary_contig_impl_fn_ptr_t
-    greater_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int greater_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    greater_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_greater_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = greater_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::GreaterTypeMapFactory;
-    DispatchTableBuilder<int, GreaterTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(greater_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::GreaterStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, GreaterStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(greater_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::GreaterContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, GreaterContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(greater_contig_dispatch_table);
-};
-} // namespace impl
-
-// B12: ==== GREATER_EQUAL (x1, x2)
-namespace impl
-{
-namespace greater_equal_fn_ns = dpctl::tensor::kernels::greater_equal;
-
-static binary_contig_impl_fn_ptr_t
-    greater_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int
greater_equal_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - greater_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_greater_equal_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = greater_equal_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::GreaterEqualTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(greater_equal_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::GreaterEqualStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(greater_equal_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::GreaterEqualContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(greater_equal_contig_dispatch_table); -}; -} // namespace impl - -// U16: ==== IMAG (x) -namespace impl -{ - -namespace imag_fn_ns = dpctl::tensor::kernels::imag; - -static unary_contig_impl_fn_ptr_t imag_contig_dispatch_vector[td_ns::num_types]; -static int imag_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - imag_strided_dispatch_vector[td_ns::num_types]; - -void populate_imag_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = imag_fn_ns; - - using fn_ns::ImagContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(imag_contig_dispatch_vector); - - using fn_ns::ImagStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(imag_strided_dispatch_vector); - - using fn_ns::ImagTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(imag_output_typeid_vector); -} -} // namespace impl - -// U17: ==== ISFINITE (x) -namespace impl -{ -namespace isfinite_fn_ns = dpctl::tensor::kernels::isfinite; - -static unary_contig_impl_fn_ptr_t - isfinite_contig_dispatch_vector[td_ns::num_types]; -static int isfinite_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - isfinite_strided_dispatch_vector[td_ns::num_types]; - -void populate_isfinite_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = isfinite_fn_ns; - - using fn_ns::IsFiniteContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(isfinite_contig_dispatch_vector); - - using fn_ns::IsFiniteStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(isfinite_strided_dispatch_vector); - - using fn_ns::IsFiniteTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(isfinite_output_typeid_vector); -} - -} // namespace impl - -// U18: ==== ISINF (x) -namespace impl -{ -namespace isinf_fn_ns = dpctl::tensor::kernels::isinf; - -static unary_contig_impl_fn_ptr_t - isinf_contig_dispatch_vector[td_ns::num_types]; -static int isinf_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - isinf_strided_dispatch_vector[td_ns::num_types]; - -void populate_isinf_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = isinf_fn_ns; - - using fn_ns::IsInfContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(isinf_contig_dispatch_vector); - - using fn_ns::IsInfStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(isinf_strided_dispatch_vector); - - using fn_ns::IsInfTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(isinf_output_typeid_vector); -} - -} // 
namespace impl - -// U19: ==== ISNAN (x) -namespace impl -{ -namespace isnan_fn_ns = dpctl::tensor::kernels::isnan; - -static unary_contig_impl_fn_ptr_t - isnan_contig_dispatch_vector[td_ns::num_types]; -static int isnan_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - isnan_strided_dispatch_vector[td_ns::num_types]; - -void populate_isnan_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = isnan_fn_ns; - - using fn_ns::IsNanContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(isnan_contig_dispatch_vector); - - using fn_ns::IsNanStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(isnan_strided_dispatch_vector); - - using fn_ns::IsNanTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(isnan_output_typeid_vector); -} - -} // namespace impl - -// B13: ==== LESS (x1, x2) -namespace impl -{ -namespace less_fn_ns = dpctl::tensor::kernels::less; - -static binary_contig_impl_fn_ptr_t less_contig_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static int less_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - less_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_less_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = less_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::LessTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(less_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::LessStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(less_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::LessContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(less_contig_dispatch_table); -}; -} // namespace impl - -// B14: ==== LESS_EQUAL (x1, x2) -namespace impl -{ -namespace less_equal_fn_ns = dpctl::tensor::kernels::less_equal; - -static binary_contig_impl_fn_ptr_t - less_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int less_equal_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - less_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_less_equal_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = less_equal_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::LessEqualTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(less_equal_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::LessEqualStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(less_equal_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::LessEqualContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(less_equal_contig_dispatch_table); -}; -} // namespace impl - -// U20: ==== LOG (x) -namespace impl -{ - -namespace log_fn_ns = dpctl::tensor::kernels::log; - -static unary_contig_impl_fn_ptr_t log_contig_dispatch_vector[td_ns::num_types]; -static int log_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - log_strided_dispatch_vector[td_ns::num_types]; - -void populate_log_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns 
= log_fn_ns; - - using fn_ns::LogContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(log_contig_dispatch_vector); - - using fn_ns::LogStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(log_strided_dispatch_vector); - - using fn_ns::LogTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(log_output_typeid_vector); -} - -} // namespace impl - -// U21: ==== LOG1P (x) -namespace impl -{ - -namespace log1p_fn_ns = dpctl::tensor::kernels::log1p; - -static unary_contig_impl_fn_ptr_t - log1p_contig_dispatch_vector[td_ns::num_types]; -static int log1p_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - log1p_strided_dispatch_vector[td_ns::num_types]; - -void populate_log1p_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = log1p_fn_ns; - - using fn_ns::Log1pContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(log1p_contig_dispatch_vector); - - using fn_ns::Log1pStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(log1p_strided_dispatch_vector); - - using fn_ns::Log1pTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(log1p_output_typeid_vector); -} - -} // namespace impl - -// U22: ==== LOG2 (x) -namespace impl -{ - -namespace log2_fn_ns = dpctl::tensor::kernels::log2; - -static unary_contig_impl_fn_ptr_t log2_contig_dispatch_vector[td_ns::num_types]; -static int log2_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - log2_strided_dispatch_vector[td_ns::num_types]; - -void populate_log2_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = log2_fn_ns; - - using fn_ns::Log2ContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(log2_contig_dispatch_vector); - - using fn_ns::Log2StridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(log2_strided_dispatch_vector); - - using fn_ns::Log2TypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(log2_output_typeid_vector); -}; - -} // namespace impl - -// U23: ==== LOG10 (x) -namespace impl -{ - -namespace log10_fn_ns = dpctl::tensor::kernels::log10; - -static unary_contig_impl_fn_ptr_t - log10_contig_dispatch_vector[td_ns::num_types]; -static int log10_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - log10_strided_dispatch_vector[td_ns::num_types]; - -void populate_log10_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = log10_fn_ns; - - using fn_ns::Log10ContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(log10_contig_dispatch_vector); - - using fn_ns::Log10StridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(log10_strided_dispatch_vector); - - using fn_ns::Log10TypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(log10_output_typeid_vector); -}; - -} // namespace impl - -// B15: ==== LOGADDEXP (x1, x2) -namespace impl -{ -namespace logaddexp_fn_ns = dpctl::tensor::kernels::logaddexp; - -static binary_contig_impl_fn_ptr_t - logaddexp_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int logaddexp_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - logaddexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_logaddexp_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = logaddexp_fn_ns; - - // which input types are 
supported, and what is the type of the result - using fn_ns::LogAddExpTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(logaddexp_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::LogAddExpStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(logaddexp_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::LogAddExpContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(logaddexp_contig_dispatch_table); -}; -} // namespace impl - -// B16: ==== LOGICAL_AND (x1, x2) -namespace impl -{ -namespace logical_and_fn_ns = dpctl::tensor::kernels::logical_and; - -static binary_contig_impl_fn_ptr_t - logical_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int logical_and_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - logical_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_logical_and_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = logical_and_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::LogicalAndTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(logical_and_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::LogicalAndStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(logical_and_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::LogicalAndContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(logical_and_contig_dispatch_table); -}; -} // namespace impl - -// U24: ==== LOGICAL_NOT (x) -namespace impl -{ -namespace logical_not_fn_ns = dpctl::tensor::kernels::logical_not; - -static unary_contig_impl_fn_ptr_t - logical_not_contig_dispatch_vector[td_ns::num_types]; -static int logical_not_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - logical_not_strided_dispatch_vector[td_ns::num_types]; - -void populate_logical_not_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = logical_not_fn_ns; - - using fn_ns::LogicalNotContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(logical_not_contig_dispatch_vector); - - using fn_ns::LogicalNotStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(logical_not_strided_dispatch_vector); - - using fn_ns::LogicalNotTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(logical_not_output_typeid_vector); -}; -} // namespace impl - -// B17: ==== LOGICAL_OR (x1, x2) -namespace impl -{ -namespace logical_or_fn_ns = dpctl::tensor::kernels::logical_or; - -static binary_contig_impl_fn_ptr_t - logical_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int logical_or_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - logical_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_logical_or_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = logical_or_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::LogicalOrTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(logical_or_output_id_table); - - // function pointers for operation on general strided arrays - using 
fn_ns::LogicalOrStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(logical_or_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::LogicalOrContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(logical_or_contig_dispatch_table); -}; -} // namespace impl - -// B18: ==== LOGICAL_XOR (x1, x2) -namespace impl -{ -namespace logical_xor_fn_ns = dpctl::tensor::kernels::logical_xor; - -static binary_contig_impl_fn_ptr_t - logical_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int logical_xor_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - logical_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_logical_xor_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = logical_xor_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::LogicalXorTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(logical_xor_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::LogicalXorStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(logical_xor_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::LogicalXorContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(logical_xor_contig_dispatch_table); -}; -} // namespace impl - -// B??: ==== MAXIMUM (x1, x2) -namespace impl -{ - -namespace maximum_fn_ns = dpctl::tensor::kernels::maximum; - -static binary_contig_impl_fn_ptr_t - maximum_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int maximum_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - maximum_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_maximum_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = maximum_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::MaximumTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(maximum_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::MaximumStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(maximum_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::MaximumContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(maximum_contig_dispatch_table); -}; - -} // namespace impl - -// B??: ==== MINIMUM (x1, x2) -namespace impl -{ - -namespace minimum_fn_ns = dpctl::tensor::kernels::minimum; - -static binary_contig_impl_fn_ptr_t - minimum_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int minimum_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - minimum_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_minimum_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = minimum_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::MinimumTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(minimum_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::MinimumStridedFactory; - DispatchTableBuilder - dtb2; - 
dtb2.populate_dispatch_table(minimum_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::MinimumContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(minimum_contig_dispatch_table); -}; - -} // namespace impl - -// B19: ==== MULTIPLY (x1, x2) -namespace impl -{ - -namespace multiply_fn_ns = dpctl::tensor::kernels::multiply; - -static binary_contig_impl_fn_ptr_t - multiply_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int multiply_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - multiply_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -// mul(matrix, row) -static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t - multiply_contig_matrix_contig_row_broadcast_dispatch_table - [td_ns::num_types][td_ns::num_types]; - -// mul(row, matrix) -static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t - multiply_contig_row_contig_matrix_broadcast_dispatch_table - [td_ns::num_types][td_ns::num_types]; - -static binary_inplace_contig_impl_fn_ptr_t - multiply_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static binary_inplace_strided_impl_fn_ptr_t - multiply_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; -static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t - multiply_inplace_row_matrix_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -void populate_multiply_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = multiply_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::MultiplyTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(multiply_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::MultiplyStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(multiply_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::MultiplyContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(multiply_contig_dispatch_table); - - // function pointers for operation on contiguous matrix, contiguous row - // with contiguous matrix output - using fn_ns::MultiplyContigMatrixContigRowBroadcastFactory; - DispatchTableBuilder< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, - MultiplyContigMatrixContigRowBroadcastFactory, num_types> - dtb4; - dtb4.populate_dispatch_table( - multiply_contig_matrix_contig_row_broadcast_dispatch_table); - - // function pointers for operation on contiguous row, contiguous matrix - // with contiguous matrix output - using fn_ns::MultiplyContigRowContigMatrixBroadcastFactory; - DispatchTableBuilder< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, - MultiplyContigRowContigMatrixBroadcastFactory, num_types> - dtb5; - dtb5.populate_dispatch_table( - multiply_contig_row_contig_matrix_broadcast_dispatch_table); - - // function pointers for inplace operation on general strided arrays - using fn_ns::MultiplyInplaceStridedFactory; - DispatchTableBuilder - dtb6; - dtb6.populate_dispatch_table(multiply_inplace_strided_dispatch_table); - - // function pointers for inplace operation on contiguous inputs and output - using fn_ns::MultiplyInplaceContigFactory; - DispatchTableBuilder - dtb7; - dtb7.populate_dispatch_table(multiply_inplace_contig_dispatch_table); - - // function pointers for inplace operation on contiguous matrix - // and 
contiguous row - using fn_ns::MultiplyInplaceRowMatrixBroadcastFactory; - DispatchTableBuilder - dtb8; - dtb8.populate_dispatch_table(multiply_inplace_row_matrix_dispatch_table); -}; - -} // namespace impl - -// U25: ==== NEGATIVE (x) -namespace impl -{ - -namespace negative_fn_ns = dpctl::tensor::kernels::negative; - -static unary_contig_impl_fn_ptr_t - negative_contig_dispatch_vector[td_ns::num_types]; -static int negative_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - negative_strided_dispatch_vector[td_ns::num_types]; - -void populate_negative_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = negative_fn_ns; - - using fn_ns::NegativeContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(negative_contig_dispatch_vector); - - using fn_ns::NegativeStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(negative_strided_dispatch_vector); - - using fn_ns::NegativeTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(negative_output_typeid_vector); -} - -} // namespace impl - -// B20: ==== NOT_EQUAL (x1, x2) -namespace impl -{ -namespace not_equal_fn_ns = dpctl::tensor::kernels::not_equal; - -static binary_contig_impl_fn_ptr_t - not_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int not_equal_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - not_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_not_equal_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = not_equal_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::NotEqualTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(not_equal_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::NotEqualStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(not_equal_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::NotEqualContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(not_equal_contig_dispatch_table); -}; -} // namespace impl - -// U26: ==== POSITIVE (x) -namespace impl -{ - -namespace positive_fn_ns = dpctl::tensor::kernels::positive; - -static unary_contig_impl_fn_ptr_t - positive_contig_dispatch_vector[td_ns::num_types]; -static int positive_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - positive_strided_dispatch_vector[td_ns::num_types]; - -void populate_positive_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = positive_fn_ns; - - using fn_ns::PositiveContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(positive_contig_dispatch_vector); - - using fn_ns::PositiveStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(positive_strided_dispatch_vector); - - using fn_ns::PositiveTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(positive_output_typeid_vector); -} - -} // namespace impl - -// B21: ==== POW (x1, x2) -namespace impl -{ - -namespace pow_fn_ns = dpctl::tensor::kernels::pow; - -static binary_contig_impl_fn_ptr_t pow_contig_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static int pow_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - 
pow_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_pow_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = pow_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::PowTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(pow_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::PowStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(pow_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::PowContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(pow_contig_dispatch_table); -}; - -} // namespace impl - -// U??: ==== PROJ (x) -namespace impl -{ - -namespace proj_fn_ns = dpctl::tensor::kernels::proj; - -static unary_contig_impl_fn_ptr_t proj_contig_dispatch_vector[td_ns::num_types]; -static int proj_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - proj_strided_dispatch_vector[td_ns::num_types]; - -void populate_proj_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = proj_fn_ns; - - using fn_ns::ProjContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(proj_contig_dispatch_vector); - - using fn_ns::ProjStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(proj_strided_dispatch_vector); - - using fn_ns::ProjTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(proj_output_typeid_vector); -} -} // namespace impl - -// U27: ==== REAL (x) -namespace impl -{ - -namespace real_fn_ns = dpctl::tensor::kernels::real; - -static unary_contig_impl_fn_ptr_t real_contig_dispatch_vector[td_ns::num_types]; -static int real_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - real_strided_dispatch_vector[td_ns::num_types]; - -void populate_real_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = real_fn_ns; - - using fn_ns::RealContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(real_contig_dispatch_vector); - - using fn_ns::RealStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(real_strided_dispatch_vector); - - using fn_ns::RealTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(real_output_typeid_vector); -} -} // namespace impl - -// B22: ==== REMAINDER (x1, x2) -namespace impl -{ - -namespace remainder_fn_ns = dpctl::tensor::kernels::remainder; - -static binary_contig_impl_fn_ptr_t - remainder_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int remainder_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - remainder_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_remainder_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = remainder_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::RemainderTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(remainder_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::RemainderStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(remainder_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::RemainderContigFactory; - DispatchTableBuilder - dtb3; - 
dtb3.populate_dispatch_table(remainder_contig_dispatch_table); -} - -} // namespace impl - -// U28: ==== ROUND (x) -namespace impl -{ - -namespace round_fn_ns = dpctl::tensor::kernels::round; - -static unary_contig_impl_fn_ptr_t - round_contig_dispatch_vector[td_ns::num_types]; -static int round_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - round_strided_dispatch_vector[td_ns::num_types]; - -void populate_round_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = round_fn_ns; - - using fn_ns::RoundContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(round_contig_dispatch_vector); - - using fn_ns::RoundStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(round_strided_dispatch_vector); - - using fn_ns::RoundTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(round_output_typeid_vector); -} - -} // namespace impl - -// U29: ==== SIGN (x) -namespace impl -{ - -namespace sign_fn_ns = dpctl::tensor::kernels::sign; - -static unary_contig_impl_fn_ptr_t sign_contig_dispatch_vector[td_ns::num_types]; -static int sign_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - sign_strided_dispatch_vector[td_ns::num_types]; - -void populate_sign_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = sign_fn_ns; - - using fn_ns::SignContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(sign_contig_dispatch_vector); - - using fn_ns::SignStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(sign_strided_dispatch_vector); - - using fn_ns::SignTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(sign_output_typeid_vector); -} - -} // namespace impl - -// ==== SIGNBIT (x) -namespace impl -{ - -namespace signbit_fn_ns = dpctl::tensor::kernels::signbit; - -static unary_contig_impl_fn_ptr_t - signbit_contig_dispatch_vector[td_ns::num_types]; -static int signbit_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - signbit_strided_dispatch_vector[td_ns::num_types]; - -void populate_signbit_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = signbit_fn_ns; - - using fn_ns::SignbitContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(signbit_contig_dispatch_vector); - - using fn_ns::SignbitStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(signbit_strided_dispatch_vector); - - using fn_ns::SignbitTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(signbit_output_typeid_vector); -} - -} // namespace impl - -// U30: ==== SIN (x) -namespace impl -{ - -namespace sin_fn_ns = dpctl::tensor::kernels::sin; - -static unary_contig_impl_fn_ptr_t sin_contig_dispatch_vector[td_ns::num_types]; -static int sin_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - sin_strided_dispatch_vector[td_ns::num_types]; - -void populate_sin_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = sin_fn_ns; - - using fn_ns::SinContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(sin_contig_dispatch_vector); - - using fn_ns::SinStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(sin_strided_dispatch_vector); - - using fn_ns::SinTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(sin_output_typeid_vector); -} - -} // namespace impl - -// U31: ==== SINH (x) -namespace 
impl -{ - -namespace sinh_fn_ns = dpctl::tensor::kernels::sinh; - -static unary_contig_impl_fn_ptr_t sinh_contig_dispatch_vector[td_ns::num_types]; -static int sinh_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - sinh_strided_dispatch_vector[td_ns::num_types]; - -void populate_sinh_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = sinh_fn_ns; - - using fn_ns::SinhContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(sinh_contig_dispatch_vector); - - using fn_ns::SinhStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(sinh_strided_dispatch_vector); - - using fn_ns::SinhTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(sinh_output_typeid_vector); -} - -} // namespace impl - -// U32: ==== SQUARE (x) -namespace impl -{ - -namespace square_fn_ns = dpctl::tensor::kernels::square; - -static unary_contig_impl_fn_ptr_t - square_contig_dispatch_vector[td_ns::num_types]; -static int square_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - square_strided_dispatch_vector[td_ns::num_types]; - -void populate_square_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = square_fn_ns; - - using fn_ns::SquareContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(square_contig_dispatch_vector); - - using fn_ns::SquareStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(square_strided_dispatch_vector); - - using fn_ns::SquareTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(square_output_typeid_vector); -} - -} // namespace impl - -// U33: ==== SQRT (x) -namespace impl -{ - -namespace sqrt_fn_ns = dpctl::tensor::kernels::sqrt; - -static unary_contig_impl_fn_ptr_t sqrt_contig_dispatch_vector[td_ns::num_types]; -static int sqrt_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - sqrt_strided_dispatch_vector[td_ns::num_types]; - -void populate_sqrt_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = sqrt_fn_ns; - - using fn_ns::SqrtContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(sqrt_contig_dispatch_vector); - - using fn_ns::SqrtStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(sqrt_strided_dispatch_vector); - - using fn_ns::SqrtTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(sqrt_output_typeid_vector); -} - -} // namespace impl - -// B23: ==== SUBTRACT (x1, x2) -namespace impl -{ -namespace subtract_fn_ns = dpctl::tensor::kernels::subtract; - -static binary_contig_impl_fn_ptr_t - subtract_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int subtract_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - subtract_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -// sub(matrix, row) -static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t - subtract_contig_matrix_contig_row_broadcast_dispatch_table - [td_ns::num_types][td_ns::num_types]; - -// sub(row, matrix) -static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t - subtract_contig_row_contig_matrix_broadcast_dispatch_table - [td_ns::num_types][td_ns::num_types]; - -static binary_inplace_contig_impl_fn_ptr_t - subtract_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static binary_inplace_strided_impl_fn_ptr_t - 
subtract_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; -static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t - subtract_inplace_row_matrix_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -void populate_subtract_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = subtract_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::SubtractTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(subtract_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::SubtractStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(subtract_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::SubtractContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(subtract_contig_dispatch_table); - - // function pointers for operation on contiguous matrix, contiguous row - // with contiguous matrix output - using fn_ns::SubtractContigMatrixContigRowBroadcastFactory; - DispatchTableBuilder< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, - SubtractContigMatrixContigRowBroadcastFactory, num_types> - dtb4; - dtb4.populate_dispatch_table( - subtract_contig_matrix_contig_row_broadcast_dispatch_table); - - // function pointers for operation on contiguous row, contiguous matrix - // with contiguous matrix output - using fn_ns::SubtractContigRowContigMatrixBroadcastFactory; - DispatchTableBuilder< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, - SubtractContigRowContigMatrixBroadcastFactory, num_types> - dtb5; - dtb5.populate_dispatch_table( - subtract_contig_row_contig_matrix_broadcast_dispatch_table); - - // function pointers for inplace operation on general strided arrays - using fn_ns::SubtractInplaceStridedFactory; - DispatchTableBuilder - dtb6; - dtb6.populate_dispatch_table(subtract_inplace_strided_dispatch_table); - - // function pointers for inplace operation on contiguous inputs and output - using fn_ns::SubtractInplaceContigFactory; - DispatchTableBuilder - dtb7; - dtb7.populate_dispatch_table(subtract_inplace_contig_dispatch_table); - - // function pointers for inplace operation on contiguous matrix - // and contiguous row - using fn_ns::SubtractInplaceRowMatrixBroadcastFactory; - DispatchTableBuilder - dtb8; - dtb8.populate_dispatch_table(subtract_inplace_row_matrix_dispatch_table); -}; - -} // namespace impl - -// U34: ==== TAN (x) -namespace impl -{ - -namespace tan_fn_ns = dpctl::tensor::kernels::tan; - -static unary_contig_impl_fn_ptr_t tan_contig_dispatch_vector[td_ns::num_types]; -static int tan_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - tan_strided_dispatch_vector[td_ns::num_types]; - -void populate_tan_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = tan_fn_ns; - - using fn_ns::TanContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(tan_contig_dispatch_vector); - - using fn_ns::TanStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(tan_strided_dispatch_vector); - - using fn_ns::TanTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(tan_output_typeid_vector); -} - -} // namespace impl - -// U35: ==== TANH (x) -namespace impl -{ - -namespace tanh_fn_ns = dpctl::tensor::kernels::tanh; - -static unary_contig_impl_fn_ptr_t tanh_contig_dispatch_vector[td_ns::num_types]; -static int 
tanh_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    tanh_strided_dispatch_vector[td_ns::num_types];
-
-void populate_tanh_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = tanh_fn_ns;
-
-    using fn_ns::TanhContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TanhContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(tanh_contig_dispatch_vector);
-
-    using fn_ns::TanhStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TanhStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(tanh_strided_dispatch_vector);
-
-    using fn_ns::TanhTypeMapFactory;
-    DispatchVectorBuilder<int, TanhTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(tanh_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U36: ==== TRUNC (x)
-namespace impl
-{
-
-namespace trunc_fn_ns = dpctl::tensor::kernels::trunc;
-
-static unary_contig_impl_fn_ptr_t
-    trunc_contig_dispatch_vector[td_ns::num_types];
-static int trunc_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    trunc_strided_dispatch_vector[td_ns::num_types];
-
-void populate_trunc_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = trunc_fn_ns;
-
-    using fn_ns::TruncContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TruncContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(trunc_contig_dispatch_vector);
-
-    using fn_ns::TruncStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TruncStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(trunc_strided_dispatch_vector);
-
-    using fn_ns::TruncTypeMapFactory;
-    DispatchVectorBuilder<int, TruncTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(trunc_output_typeid_vector);
-}
-
-} // namespace impl
-
-// B24: ==== HYPOT (x1, x2)
-
-namespace impl
-{
-namespace hypot_fn_ns = dpctl::tensor::kernels::hypot;
-
-static binary_contig_impl_fn_ptr_t
-    hypot_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int hypot_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    hypot_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_hypot_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = hypot_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::HypotTypeMapFactory;
-    DispatchTableBuilder<int, HypotTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(hypot_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::HypotStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, HypotStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(hypot_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::HypotContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, HypotContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(hypot_contig_dispatch_table);
-};
-
-} // namespace impl
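Each deleted operator section above registers up to three kernel families: a type map, a general strided implementation, and a contiguous fast path (plus broadcast and in-place variants for a few binary operators). At call time the Python binding layer chooses between the strided and contiguous pointers. A simplified sketch of that choice, under assumed signatures (this is not py_binary_ufunc itself):

    #include <cstddef>
    #include <iostream>

    using contig_fn_t = void (*)(std::size_t n);
    using strided_fn_t = void (*)(std::size_t n, std::ptrdiff_t stride);

    static void contig_impl(std::size_t) { std::cout << "contig fast path\n"; }
    static void strided_impl(std::size_t, std::ptrdiff_t)
    {
        std::cout << "general strided path\n";
    }

    // Prefer the specialized contiguous kernel when one was registered and
    // the data layout allows it; otherwise fall back to the strided kernel.
    void dispatch(bool all_c_contig, contig_fn_t contig_fn,
                  strided_fn_t strided_fn, std::size_t n, std::ptrdiff_t stride)
    {
        if (all_c_contig && contig_fn != nullptr) {
            contig_fn(n);
        }
        else {
            strided_fn(n, stride);
        }
    }

    int main()
    {
        dispatch(true, &contig_impl, &strided_impl, 8, 1);  // contig fast path
        dispatch(false, &contig_impl, &strided_impl, 8, 2); // strided path
    }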
-
-// ==========================================================================================
-//
-
-namespace py = pybind11;
-
-void init_elementwise_functions(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-
-    // U01: ==== ABS (x)
-    {
-        impl::populate_abs_dispatch_vectors();
-        using impl::abs_contig_dispatch_vector;
-        using impl::abs_output_typeid_vector;
-        using impl::abs_strided_dispatch_vector;
-
-        auto abs_pyapi = [&](const arrayT &src, const arrayT &dst,
-                             sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, abs_output_typeid_vector,
-                abs_contig_dispatch_vector, abs_strided_dispatch_vector);
-        };
-        m.def("_abs", abs_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto abs_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, abs_output_typeid_vector);
-        };
-        m.def("_abs_result_type", abs_result_type_pyapi);
-    }
-
-    // U02: ==== ACOS (x)
-    {
-        impl::populate_acos_dispatch_vectors();
-        using impl::acos_contig_dispatch_vector;
-        using impl::acos_output_typeid_vector;
-        using impl::acos_strided_dispatch_vector;
-
-        auto acos_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, acos_output_typeid_vector,
-                acos_contig_dispatch_vector, acos_strided_dispatch_vector);
-        };
-        m.def("_acos", acos_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto acos_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, acos_output_typeid_vector);
-        };
-        m.def("_acos_result_type", acos_result_type_pyapi);
-    }
-
-    // U03: ===== ACOSH (x)
-    {
-        impl::populate_acosh_dispatch_vectors();
-        using impl::acosh_contig_dispatch_vector;
-        using impl::acosh_output_typeid_vector;
-        using impl::acosh_strided_dispatch_vector;
-
-        auto acosh_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, acosh_output_typeid_vector,
-                acosh_contig_dispatch_vector, acosh_strided_dispatch_vector);
-        };
-        m.def("_acosh", acosh_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto acosh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              acosh_output_typeid_vector);
-        };
-        m.def("_acosh_result_type", acosh_result_type_pyapi);
-    }
-
-    // B01: ===== ADD (x1, x2)
-    {
-        impl::populate_add_dispatch_tables();
-        using impl::add_contig_dispatch_table;
-        using impl::add_contig_matrix_contig_row_broadcast_dispatch_table;
-        using impl::add_contig_row_contig_matrix_broadcast_dispatch_table;
-        using impl::add_output_id_table;
-        using impl::add_strided_dispatch_table;
-
-        auto add_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                             const dpctl::tensor::usm_ndarray &src2,
-                             const dpctl::tensor::usm_ndarray &dst,
-                             sycl::queue &exec_q,
-                             const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, add_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                add_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                add_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                add_contig_matrix_contig_row_broadcast_dispatch_table,
-                // function pointers to handle operation of c-contig row and
-                // c-contig matrix with broadcasting (may be nullptr)
-                add_contig_row_contig_matrix_broadcast_dispatch_table);
-        };
-        auto add_result_type_pyapi = [&](const py::dtype &dtype1,
-                                         const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               add_output_id_table);
-        };
-        m.def("_add", add_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_add_result_type", add_result_type_pyapi, "");
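The in-place tables registered next (add_inplace_*) exist because _add_inplace writes through its left-hand operand instead of allocating a result, so it needs kernels with a different signature than the out-of-place ones and never consults an output-allocation step. A minimal host-side illustration of what such an in-place contiguous kernel does (hypothetical, CPU-only; the real kernels are SYCL kernels submitted to a queue):

    #include <cstddef>
    #include <iostream>

    // In-place contiguous add: lhs is both an input and the destination,
    // so no temporary result array is needed.
    template <typename T>
    void add_inplace_contig(T *lhs, const T *rhs, std::size_t n)
    {
        for (std::size_t i = 0; i < n; ++i) {
            lhs[i] += rhs[i];
        }
    }

    int main()
    {
        double lhs[3] = {1.0, 2.0, 3.0};
        const double rhs[3] = {0.5, 0.5, 0.5};
        add_inplace_contig(lhs, rhs, 3);
        std::cout << lhs[0] << " " << lhs[1] << " " << lhs[2] << "\n"; // 1.5 2.5 3.5
    }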
-
-        using impl::add_inplace_contig_dispatch_table;
-        using impl::add_inplace_row_matrix_dispatch_table;
-        using impl::add_inplace_strided_dispatch_table;
-
-        auto add_inplace_pyapi =
-            [&](const dpctl::tensor::usm_ndarray &src,
-                const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
-                const std::vector<sycl::event> &depends = {}) {
-                return py_binary_inplace_ufunc(
-                    src, dst, exec_q, depends, add_output_id_table,
-                    // function pointers to handle inplace operation on
-                    // contiguous arrays (pointers may be nullptr)
-                    add_inplace_contig_dispatch_table,
-                    // function pointers to handle inplace operation on strided
-                    // arrays (most general case)
-                    add_inplace_strided_dispatch_table,
-                    // function pointers to handle inplace operation on
-                    // c-contig matrix with c-contig row with broadcasting
-                    // (may be nullptr)
-                    add_inplace_row_matrix_dispatch_table);
-            };
-        m.def("_add_inplace", add_inplace_pyapi, "", py::arg("lhs"),
-              py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-
-    // U04: ===== ASIN (x)
-    {
-        impl::populate_asin_dispatch_vectors();
-        using impl::asin_contig_dispatch_vector;
-        using impl::asin_output_typeid_vector;
-        using impl::asin_strided_dispatch_vector;
-
-        auto asin_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, asin_output_typeid_vector,
-                asin_contig_dispatch_vector, asin_strided_dispatch_vector);
-        };
-        m.def("_asin", asin_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto asin_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, asin_output_typeid_vector);
-        };
-        m.def("_asin_result_type", asin_result_type_pyapi);
-    }
-
-    // U05: ===== ASINH (x)
-    {
-        impl::populate_asinh_dispatch_vectors();
-        using impl::asinh_contig_dispatch_vector;
-        using impl::asinh_output_typeid_vector;
-        using impl::asinh_strided_dispatch_vector;
-
-        auto asinh_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, asinh_output_typeid_vector,
-                asinh_contig_dispatch_vector, asinh_strided_dispatch_vector);
-        };
-        m.def("_asinh", asinh_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto asinh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              asinh_output_typeid_vector);
-        };
-        m.def("_asinh_result_type", asinh_result_type_pyapi);
-    }
-
-    // U06: ===== ATAN (x)
-    {
-        impl::populate_atan_dispatch_vectors();
-        using impl::atan_contig_dispatch_vector;
-        using impl::atan_output_typeid_vector;
-        using impl::atan_strided_dispatch_vector;
-
-        auto atan_pyapi = [&](arrayT src, arrayT dst, sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, atan_output_typeid_vector,
-                atan_contig_dispatch_vector, atan_strided_dispatch_vector);
-        };
-        m.def("_atan", atan_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto atan_result_type_pyapi = [&](py::dtype dtype) {
-            return py_unary_ufunc_result_type(dtype, atan_output_typeid_vector);
-        };
-        m.def("_atan_result_type", atan_result_type_pyapi);
-    }
-
-    // B02: ===== ATAN2 (x1, x2)
-    {
-        impl::populate_atan2_dispatch_tables();
-        using impl::atan2_contig_dispatch_table;
-        using impl::atan2_output_id_table;
-        using impl::atan2_strided_dispatch_table;
-
-        auto atan2_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                               const dpctl::tensor::usm_ndarray &src2,
-                               const dpctl::tensor::usm_ndarray &dst,
-                               sycl::queue &exec_q,
-                               const std::vector<sycl::event> &depends =
{}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, atan2_output_id_table, - // function pointers to handle operation on contiguous arrays - // (pointers may be nullptr) - atan2_contig_dispatch_table, - // function pointers to handle operation on strided arrays (most - // general case) - atan2_strided_dispatch_table, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); - }; - auto atan2_result_type_pyapi = [&](const py::dtype &dtype1, - const py::dtype &dtype2) { - return py_binary_ufunc_result_type(dtype1, dtype2, - atan2_output_id_table); - }; - m.def("_atan2", atan2_pyapi, "", py::arg("src1"), py::arg("src2"), - py::arg("dst"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - m.def("_atan2_result_type", atan2_result_type_pyapi, ""); - } - - // U07: ===== ATANH (x) - { - impl::populate_atanh_dispatch_vectors(); - using impl::atanh_contig_dispatch_vector; - using impl::atanh_output_typeid_vector; - using impl::atanh_strided_dispatch_vector; - - auto atanh_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, atanh_output_typeid_vector, - atanh_contig_dispatch_vector, atanh_strided_dispatch_vector); - }; - m.def("_atanh", atanh_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto atanh_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, - atanh_output_typeid_vector); - }; - m.def("_atanh_result_type", atanh_result_type_pyapi); - } - - // B03: ===== BITWISE_AND (x1, x2) - { - impl::populate_bitwise_and_dispatch_tables(); - using impl::bitwise_and_contig_dispatch_table; - using impl::bitwise_and_output_id_table; - using impl::bitwise_and_strided_dispatch_table; - - auto bitwise_and_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, - const dpctl::tensor::usm_ndarray &src2, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = - {}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, bitwise_and_output_id_table, - // function pointers to handle operation on contiguous arrays - // (pointers may be nullptr) - bitwise_and_contig_dispatch_table, - // function pointers to handle operation on strided arrays (most - // general case) - bitwise_and_strided_dispatch_table, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); - }; - auto bitwise_and_result_type_pyapi = [&](const py::dtype &dtype1, - const py::dtype &dtype2) { - return py_binary_ufunc_result_type(dtype1, dtype2, - bitwise_and_output_id_table); - }; - m.def("_bitwise_and", bitwise_and_pyapi, "", py::arg("src1"), - py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - m.def("_bitwise_and_result_type", 
-    // B04: ===== BITWISE_LEFT_SHIFT (x1, x2)
-    {
-        impl::populate_bitwise_left_shift_dispatch_tables();
-        using impl::bitwise_left_shift_contig_dispatch_table;
-        using impl::bitwise_left_shift_output_id_table;
-        using impl::bitwise_left_shift_strided_dispatch_table;
-
-        auto bitwise_left_shift_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, bitwise_left_shift_output_id_table,
-                bitwise_left_shift_contig_dispatch_table,
-                bitwise_left_shift_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_left_shift_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, bitwise_left_shift_output_id_table);
-        };
-        m.def("_bitwise_left_shift", bitwise_left_shift_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_bitwise_left_shift_result_type", bitwise_left_shift_result_type_pyapi, "");
-    }
-
-    // U08: ===== BITWISE_INVERT (x)
-    {
-        impl::populate_bitwise_invert_dispatch_vectors();
-        using impl::bitwise_invert_contig_dispatch_vector;
-        using impl::bitwise_invert_output_typeid_vector;
-        using impl::bitwise_invert_strided_dispatch_vector;
-
-        auto bitwise_invert_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, bitwise_invert_output_typeid_vector, bitwise_invert_contig_dispatch_vector, bitwise_invert_strided_dispatch_vector);
-        };
-        m.def("_bitwise_invert", bitwise_invert_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto bitwise_invert_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, bitwise_invert_output_typeid_vector);
-        };
-        m.def("_bitwise_invert_result_type", bitwise_invert_result_type_pyapi);
-    }
-
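The bitwise bindings above pass td_ns::NullPtrTable{} where an operation has no specialized matrix-row kernel. A standalone sketch of what such a table plausibly is (an assumption about the type_dispatch internals, not dpctl's actual definition; 14 stands in for the number of supported typeids):

    #include <cstddef>

    // Sketch: a 2D table whose every entry is nullptr, signalling
    // "no specialized kernel here; use the strided fallback".
    template <typename FnT, std::size_t N = 14>
    struct null_ptr_table_sketch {
        FnT entries[N][N] = {};  // value-initialization nulls every entry
        FnT operator()(int i, int j) const { return entries[i][j]; }
    };

    int main() {
        null_ptr_table_sketch<void (*)()> t;
        return t(2, 3) == nullptr ? 0 : 1;  // always takes the fallback
    }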
-    // B05: ===== BITWISE_OR (x1, x2)
-    {
-        impl::populate_bitwise_or_dispatch_tables();
-        using impl::bitwise_or_contig_dispatch_table;
-        using impl::bitwise_or_output_id_table;
-        using impl::bitwise_or_strided_dispatch_table;
-
-        auto bitwise_or_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, bitwise_or_output_id_table,
-                bitwise_or_contig_dispatch_table,
-                bitwise_or_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_or_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, bitwise_or_output_id_table);
-        };
-        m.def("_bitwise_or", bitwise_or_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_bitwise_or_result_type", bitwise_or_result_type_pyapi, "");
-    }
-
-    // B06: ===== BITWISE_RIGHT_SHIFT (x1, x2)
-    {
-        impl::populate_bitwise_right_shift_dispatch_tables();
-        using impl::bitwise_right_shift_contig_dispatch_table;
-        using impl::bitwise_right_shift_output_id_table;
-        using impl::bitwise_right_shift_strided_dispatch_table;
-
-        auto bitwise_right_shift_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, bitwise_right_shift_output_id_table,
-                bitwise_right_shift_contig_dispatch_table,
-                bitwise_right_shift_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_right_shift_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, bitwise_right_shift_output_id_table);
-        };
-        m.def("_bitwise_right_shift", bitwise_right_shift_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_bitwise_right_shift_result_type", bitwise_right_shift_result_type_pyapi, "");
-    }
-
-    // B07: ===== BITWISE_XOR (x1, x2)
-    {
-        impl::populate_bitwise_xor_dispatch_tables();
-        using impl::bitwise_xor_contig_dispatch_table;
-        using impl::bitwise_xor_output_id_table;
-        using impl::bitwise_xor_strided_dispatch_table;
-
-        auto bitwise_xor_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, bitwise_xor_output_id_table,
-                bitwise_xor_contig_dispatch_table,
-                bitwise_xor_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_xor_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, bitwise_xor_output_id_table);
-        };
-        m.def("_bitwise_xor", bitwise_xor_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_bitwise_xor_result_type", bitwise_xor_result_type_pyapi, "");
-    }
-
-    // U09: ==== CEIL (x)
-    {
-        impl::populate_ceil_dispatch_vectors();
-        using impl::ceil_contig_dispatch_vector;
-        using impl::ceil_output_typeid_vector;
-        using impl::ceil_strided_dispatch_vector;
-
-        auto ceil_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, ceil_output_typeid_vector, ceil_contig_dispatch_vector, ceil_strided_dispatch_vector);
-        };
-        m.def("_ceil", ceil_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto ceil_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, ceil_output_typeid_vector);
-        };
-        m.def("_ceil_result_type", ceil_result_type_pyapi);
-    }
-
-    // U10: ==== CONJ (x)
-    {
-        impl::populate_conj_dispatch_vectors();
-        using impl::conj_contig_dispatch_vector;
-        using impl::conj_output_typeid_vector;
-        using impl::conj_strided_dispatch_vector;
-
-        auto conj_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, conj_output_typeid_vector, conj_contig_dispatch_vector, conj_strided_dispatch_vector);
-        };
-        m.def("_conj", conj_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto conj_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, conj_output_typeid_vector);
-        };
-        m.def("_conj_result_type", conj_result_type_pyapi);
-    }
-
-    // U11: ==== COS (x)
-    {
-        impl::populate_cos_dispatch_vectors();
-        using impl::cos_contig_dispatch_vector;
-        using impl::cos_output_typeid_vector;
-        using impl::cos_strided_dispatch_vector;
-
-        auto cos_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, cos_output_typeid_vector, cos_contig_dispatch_vector, cos_strided_dispatch_vector);
-        };
-        m.def("_cos", cos_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto cos_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, cos_output_typeid_vector);
-        };
-        m.def("_cos_result_type", cos_result_type_pyapi);
-    }
-
-    // U12: ==== COSH (x)
-    {
-        impl::populate_cosh_dispatch_vectors();
-        using impl::cosh_contig_dispatch_vector;
-        using impl::cosh_output_typeid_vector;
-        using impl::cosh_strided_dispatch_vector;
-
-        auto cosh_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, cosh_output_typeid_vector, cosh_contig_dispatch_vector, cosh_strided_dispatch_vector);
-        };
-        m.def("_cosh", cosh_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto cosh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, cosh_output_typeid_vector);
-        };
-        m.def("_cosh_result_type", cosh_result_type_pyapi);
-    }
-
-    // B08: ==== DIVIDE (x1, x2)
-    {
-        impl::populate_true_divide_dispatch_tables();
-        using impl::true_divide_contig_dispatch_table;
-        using impl::true_divide_contig_matrix_contig_row_broadcast_dispatch_table;
-        using impl::true_divide_contig_row_contig_matrix_broadcast_dispatch_table;
-        using impl::true_divide_output_id_table;
-        using impl::true_divide_strided_dispatch_table;
-
-        auto divide_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, true_divide_output_id_table,
-                true_divide_contig_dispatch_table,
-                true_divide_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and c-contig row with broadcasting (may be nullptr)
-                true_divide_contig_matrix_contig_row_broadcast_dispatch_table,
-                true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
-        };
-        auto divide_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, true_divide_output_id_table);
-        };
-        m.def("_divide", divide_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_divide_result_type", divide_result_type_pyapi, "");
-    }
-
-    // B09: ==== EQUAL (x1, x2)
-    {
-        impl::populate_equal_dispatch_tables();
-        using impl::equal_contig_dispatch_table;
-        using impl::equal_output_id_table;
-        using impl::equal_strided_dispatch_table;
-
-        auto equal_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, equal_output_id_table,
-                equal_contig_dispatch_table,
-                equal_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto equal_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, equal_output_id_table);
-        };
-        m.def("_equal", equal_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_equal_result_type", equal_result_type_pyapi, "");
-    }
-
-    // U13: ==== EXP (x)
-    {
-        impl::populate_exp_dispatch_vectors();
-        using impl::exp_contig_dispatch_vector;
-        using impl::exp_output_typeid_vector;
-        using impl::exp_strided_dispatch_vector;
-
-        auto exp_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, exp_output_typeid_vector, exp_contig_dispatch_vector, exp_strided_dispatch_vector);
-        };
-        m.def("_exp", exp_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto exp_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, exp_output_typeid_vector);
-        };
-        m.def("_exp_result_type", exp_result_type_pyapi);
-    }
-
-    // U14: ==== EXPM1 (x)
-    {
-        impl::populate_expm1_dispatch_vectors();
-        using impl::expm1_contig_dispatch_vector;
-        using impl::expm1_output_typeid_vector;
-        using impl::expm1_strided_dispatch_vector;
-
-        auto expm1_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, expm1_output_typeid_vector, expm1_contig_dispatch_vector, expm1_strided_dispatch_vector);
-        };
-        m.def("_expm1", expm1_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto expm1_result_type_pyapi = [&](const py::dtype dtype) {
-            return py_unary_ufunc_result_type(dtype, expm1_output_typeid_vector);
-        };
-        m.def("_expm1_result_type", expm1_result_type_pyapi);
-    }
-
-    // U15: ==== FLOOR (x)
-    {
-        impl::populate_floor_dispatch_vectors();
-        using impl::floor_contig_dispatch_vector;
-        using impl::floor_output_typeid_vector;
-        using impl::floor_strided_dispatch_vector;
-
-        auto floor_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, floor_output_typeid_vector, floor_contig_dispatch_vector, floor_strided_dispatch_vector);
-        };
-        m.def("_floor", floor_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto floor_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, floor_output_typeid_vector);
-        };
-        m.def("_floor_result_type", floor_result_type_pyapi);
-    }
-
-    // B10: ==== FLOOR_DIVIDE (x1, x2)
-    {
-        impl::populate_floor_divide_dispatch_tables();
-        using impl::floor_divide_contig_dispatch_table;
-        using impl::floor_divide_output_id_table;
-        using impl::floor_divide_strided_dispatch_table;
-
-        auto floor_divide_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, floor_divide_output_id_table,
-                floor_divide_contig_dispatch_table,
-                floor_divide_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto floor_divide_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, floor_divide_output_id_table);
-        };
-        m.def("_floor_divide", floor_divide_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_floor_divide_result_type", floor_divide_result_type_pyapi, "");
-    }
-
-    // B11: ==== GREATER (x1, x2)
-    {
-        impl::populate_greater_dispatch_tables();
-        using impl::greater_contig_dispatch_table;
-        using impl::greater_output_id_table;
-        using impl::greater_strided_dispatch_table;
-
-        auto greater_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, greater_output_id_table,
-                greater_contig_dispatch_table,
-                greater_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto greater_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, greater_output_id_table);
-        };
-        m.def("_greater", greater_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_greater_result_type", greater_result_type_pyapi, "");
-    }
-
-    // B12: ==== GREATER_EQUAL (x1, x2)
-    {
-        impl::populate_greater_equal_dispatch_tables();
-        using impl::greater_equal_contig_dispatch_table;
-        using impl::greater_equal_output_id_table;
-        using impl::greater_equal_strided_dispatch_table;
-
-        auto greater_equal_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, greater_equal_output_id_table,
-                greater_equal_contig_dispatch_table,
-                greater_equal_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto greater_equal_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, greater_equal_output_id_table);
-        };
-        m.def("_greater_equal", greater_equal_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_greater_equal_result_type", greater_equal_result_type_pyapi, "");
-    }
-
-    // U16: ==== IMAG (x)
-    {
-        impl::populate_imag_dispatch_vectors();
-        using impl::imag_contig_dispatch_vector;
-        using impl::imag_output_typeid_vector;
-        using impl::imag_strided_dispatch_vector;
-
-        auto imag_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, imag_output_typeid_vector, imag_contig_dispatch_vector, imag_strided_dispatch_vector);
-        };
-        m.def("_imag", imag_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto imag_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, imag_output_typeid_vector);
-        };
-        m.def("_imag_result_type", imag_result_type_pyapi);
-    }
-
-    // U17: ==== ISFINITE (x)
-    {
-        impl::populate_isfinite_dispatch_vectors();
-        using impl::isfinite_contig_dispatch_vector;
-        using impl::isfinite_output_typeid_vector;
-        using impl::isfinite_strided_dispatch_vector;
-
-        auto isfinite_pyapi = [&](const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, isfinite_output_typeid_vector, isfinite_contig_dispatch_vector, isfinite_strided_dispatch_vector);
-        };
-        auto isfinite_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, isfinite_output_typeid_vector);
-        };
-        m.def("_isfinite", isfinite_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_isfinite_result_type", isfinite_result_type_pyapi, "");
-    }
-
-    // U18: ==== ISINF (x)
-    {
-        impl::populate_isinf_dispatch_vectors();
-        using impl::isinf_contig_dispatch_vector;
-        using impl::isinf_output_typeid_vector;
-        using impl::isinf_strided_dispatch_vector;
-
-        auto isinf_pyapi = [&](const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, isinf_output_typeid_vector, isinf_contig_dispatch_vector, isinf_strided_dispatch_vector);
-        };
-        auto isinf_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, isinf_output_typeid_vector);
-        };
-        m.def("_isinf", isinf_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_isinf_result_type", isinf_result_type_pyapi, "");
-    }
-
-    // U19: ==== ISNAN (x)
-    {
-        impl::populate_isnan_dispatch_vectors();
-        using impl::isnan_contig_dispatch_vector;
-        using impl::isnan_output_typeid_vector;
-        using impl::isnan_strided_dispatch_vector;
-
-        auto isnan_pyapi = [&](const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, isnan_output_typeid_vector, isnan_contig_dispatch_vector, isnan_strided_dispatch_vector);
-        };
-        auto isnan_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, isnan_output_typeid_vector);
-        };
-        m.def("_isnan", isnan_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_isnan_result_type", isnan_result_type_pyapi, "");
-    }
-
-    // B13: ==== LESS (x1, x2)
-    {
-        impl::populate_less_dispatch_tables();
-        using impl::less_contig_dispatch_table;
-        using impl::less_output_id_table;
-        using impl::less_strided_dispatch_table;
-
-        auto less_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, less_output_id_table,
-                less_contig_dispatch_table,
-                less_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto less_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, less_output_id_table);
-        };
-        m.def("_less", less_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_less_result_type", less_result_type_pyapi, "");
-    }
-
-    // B14: ==== LESS_EQUAL (x1, x2)
-    {
-        impl::populate_less_equal_dispatch_tables();
-        using impl::less_equal_contig_dispatch_table;
-        using impl::less_equal_output_id_table;
-        using impl::less_equal_strided_dispatch_table;
-
-        auto less_equal_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, less_equal_output_id_table,
-                less_equal_contig_dispatch_table,
-                less_equal_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto less_equal_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, less_equal_output_id_table);
-        };
-        m.def("_less_equal", less_equal_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_less_equal_result_type", less_equal_result_type_pyapi, "");
-    }
-
-    // U20: ==== LOG (x)
-    {
-        impl::populate_log_dispatch_vectors();
-        using impl::log_contig_dispatch_vector;
-        using impl::log_output_typeid_vector;
-        using impl::log_strided_dispatch_vector;
-
-        auto log_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, log_output_typeid_vector, log_contig_dispatch_vector, log_strided_dispatch_vector);
-        };
-        m.def("_log", log_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto log_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, log_output_typeid_vector);
-        };
-        m.def("_log_result_type", log_result_type_pyapi);
-    }
-
-    // U21: ==== LOG1P (x)
-    {
-        impl::populate_log1p_dispatch_vectors();
-        using impl::log1p_contig_dispatch_vector;
-        using impl::log1p_output_typeid_vector;
-        using impl::log1p_strided_dispatch_vector;
-
-        auto log1p_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, log1p_output_typeid_vector, log1p_contig_dispatch_vector, log1p_strided_dispatch_vector);
-        };
-        m.def("_log1p", log1p_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto log1p_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, log1p_output_typeid_vector);
-        };
-        m.def("_log1p_result_type", log1p_result_type_pyapi);
-    }
-
-    // U22: ==== LOG2 (x)
-    {
-        impl::populate_log2_dispatch_vectors();
-        using impl::log2_contig_dispatch_vector;
-        using impl::log2_output_typeid_vector;
-        using impl::log2_strided_dispatch_vector;
-
-        auto log2_pyapi = [&](const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, log2_output_typeid_vector, log2_contig_dispatch_vector, log2_strided_dispatch_vector);
-        };
-        auto log2_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, log2_output_typeid_vector);
-        };
-        m.def("_log2", log2_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_log2_result_type", log2_result_type_pyapi, "");
-    }
-
-    // U23: ==== LOG10 (x)
-    {
-        impl::populate_log10_dispatch_vectors();
-        using impl::log10_contig_dispatch_vector;
-        using impl::log10_output_typeid_vector;
-        using impl::log10_strided_dispatch_vector;
-
-        auto log10_pyapi = [&](const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, log10_output_typeid_vector, log10_contig_dispatch_vector, log10_strided_dispatch_vector);
-        };
-        auto log10_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, log10_output_typeid_vector);
-        };
-        m.def("_log10", log10_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_log10_result_type", log10_result_type_pyapi, "");
-    }
-
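Every section starts with an impl::populate_*_dispatch_vectors() call, which fills static per-typeid function-pointer storage once at module import. A standalone sketch of that populate step (the names, the 14-entry size, and the slot assignment are illustrative assumptions):

    #include <array>

    using unary_fn = int (*)(int);
    static int square_int(int x) { return x * x; }

    // one slot per source typeid; nullptr means "no kernel for this typeid"
    static std::array<unary_fn, 14> square_contig_dispatch_vector_sketch{};

    void populate_square_dispatch_vector_sketch()
    {
        square_contig_dispatch_vector_sketch.fill(nullptr);
        square_contig_dispatch_vector_sketch[3] = &square_int;  // e.g. int32 slot
    }

    int main() {
        populate_square_dispatch_vector_sketch();
        return square_contig_dispatch_vector_sketch[3](5) == 25 ? 0 : 1;
    }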
-    // B15: ==== LOGADDEXP (x1, x2)
-    {
-        impl::populate_logaddexp_dispatch_tables();
-        using impl::logaddexp_contig_dispatch_table;
-        using impl::logaddexp_output_id_table;
-        using impl::logaddexp_strided_dispatch_table;
-
-        auto logaddexp_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, logaddexp_output_id_table,
-                logaddexp_contig_dispatch_table,
-                logaddexp_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto logaddexp_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, logaddexp_output_id_table);
-        };
-        m.def("_logaddexp", logaddexp_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_logaddexp_result_type", logaddexp_result_type_pyapi, "");
-    }
-
-    // B16: ==== LOGICAL_AND (x1, x2)
-    {
-        impl::populate_logical_and_dispatch_tables();
-        using impl::logical_and_contig_dispatch_table;
-        using impl::logical_and_output_id_table;
-        using impl::logical_and_strided_dispatch_table;
-
-        auto logical_and_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, logical_and_output_id_table,
-                logical_and_contig_dispatch_table,
-                logical_and_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto logical_and_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, logical_and_output_id_table);
-        };
-        m.def("_logical_and", logical_and_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_logical_and_result_type", logical_and_result_type_pyapi, "");
-    }
-
-    // U24: ==== LOGICAL_NOT (x)
-    {
-        impl::populate_logical_not_dispatch_vectors();
-        using impl::logical_not_contig_dispatch_vector;
-        using impl::logical_not_output_typeid_vector;
-        using impl::logical_not_strided_dispatch_vector;
-
-        auto logical_not_pyapi = [&](const arrayT &src, arrayT dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, logical_not_output_typeid_vector, logical_not_contig_dispatch_vector, logical_not_strided_dispatch_vector);
-        };
-        m.def("_logical_not", logical_not_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto logical_not_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, logical_not_output_typeid_vector);
-        };
-        m.def("_logical_not_result_type", logical_not_result_type_pyapi);
-    }
-
-    // B17: ==== LOGICAL_OR (x1, x2)
-    {
-        impl::populate_logical_or_dispatch_tables();
-        using impl::logical_or_contig_dispatch_table;
-        using impl::logical_or_output_id_table;
-        using impl::logical_or_strided_dispatch_table;
-
-        auto logical_or_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, logical_or_output_id_table,
-                logical_or_contig_dispatch_table,
-                logical_or_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto logical_or_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, logical_or_output_id_table);
-        };
-        m.def("_logical_or", logical_or_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_logical_or_result_type", logical_or_result_type_pyapi, "");
-    }
-
-    // B18: ==== LOGICAL_XOR (x1, x2)
-    {
-        impl::populate_logical_xor_dispatch_tables();
-        using impl::logical_xor_contig_dispatch_table;
-        using impl::logical_xor_output_id_table;
-        using impl::logical_xor_strided_dispatch_table;
-
-        auto logical_xor_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, logical_xor_output_id_table,
-                logical_xor_contig_dispatch_table,
-                logical_xor_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto logical_xor_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, logical_xor_output_id_table);
-        };
-        m.def("_logical_xor", logical_xor_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_logical_xor_result_type", logical_xor_result_type_pyapi, "");
-    }
-
-    // B??: ==== MAXIMUM (x1, x2)
-    {
-        impl::populate_maximum_dispatch_tables();
-        using impl::maximum_contig_dispatch_table;
-        using impl::maximum_output_id_table;
-        using impl::maximum_strided_dispatch_table;
-
-        auto maximum_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, maximum_output_id_table,
-                maximum_contig_dispatch_table,
-                maximum_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto maximum_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, maximum_output_id_table);
-        };
-        m.def("_maximum", maximum_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_maximum_result_type", maximum_result_type_pyapi, "");
-    }
-
-    // B??: ==== MINIMUM (x1, x2)
-    {
-        impl::populate_minimum_dispatch_tables();
-        using impl::minimum_contig_dispatch_table;
-        using impl::minimum_output_id_table;
-        using impl::minimum_strided_dispatch_table;
-
-        auto minimum_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, minimum_output_id_table,
-                minimum_contig_dispatch_table,
-                minimum_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto minimum_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, minimum_output_id_table);
-        };
-        m.def("_minimum", minimum_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_minimum_result_type", minimum_result_type_pyapi, "");
-    }
-
-    // B19: ==== MULTIPLY (x1, x2)
-    {
-        impl::populate_multiply_dispatch_tables();
-        using impl::multiply_contig_dispatch_table;
-        using impl::multiply_contig_matrix_contig_row_broadcast_dispatch_table;
-        using impl::multiply_contig_row_contig_matrix_broadcast_dispatch_table;
-        using impl::multiply_output_id_table;
-        using impl::multiply_strided_dispatch_table;
-
-        auto multiply_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, multiply_output_id_table,
-                multiply_contig_dispatch_table,
-                multiply_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and c-contig row with broadcasting (may be nullptr)
-                multiply_contig_matrix_contig_row_broadcast_dispatch_table,
-                multiply_contig_row_contig_matrix_broadcast_dispatch_table);
-        };
-        auto multiply_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, multiply_output_id_table);
-        };
-        m.def("_multiply", multiply_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_multiply_result_type", multiply_result_type_pyapi, "");
-
-        using impl::multiply_inplace_contig_dispatch_table;
-        using impl::multiply_inplace_row_matrix_dispatch_table;
-        using impl::multiply_inplace_strided_dispatch_table;
-
-        auto multiply_inplace_pyapi = [&](const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_inplace_ufunc(
-                src, dst, exec_q, depends, multiply_output_id_table,
-                multiply_inplace_contig_dispatch_table,
-                multiply_inplace_strided_dispatch_table,
-                multiply_inplace_row_matrix_dispatch_table);
-        };
-        m.def("_multiply_inplace", multiply_inplace_pyapi, "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-    }
-
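The in-place variants registered for add, multiply, and subtract take only lhs and rhs: the left operand doubles as the output buffer, so no separate dst argument is exposed to Python. A minimal standalone sketch of that contract:

    #include <cstddef>

    // lhs is both input and output, which is why _multiply_inplace is bound
    // with py::arg("lhs") and py::arg("rhs") rather than src/dst pairs
    void multiply_inplace_sketch(double *lhs, const double *rhs, std::size_t n)
    {
        for (std::size_t i = 0; i != n; ++i)
            lhs[i] *= rhs[i];
    }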
-    // U25: ==== NEGATIVE (x)
-    {
-        impl::populate_negative_dispatch_vectors();
-        using impl::negative_contig_dispatch_vector;
-        using impl::negative_output_typeid_vector;
-        using impl::negative_strided_dispatch_vector;
-
-        auto negative_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, negative_output_typeid_vector, negative_contig_dispatch_vector, negative_strided_dispatch_vector);
-        };
-        m.def("_negative", negative_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto negative_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, negative_output_typeid_vector);
-        };
-        m.def("_negative_result_type", negative_result_type_pyapi);
-    }
-
-    // B20: ==== NOT_EQUAL (x1, x2)
-    {
-        impl::populate_not_equal_dispatch_tables();
-        using impl::not_equal_contig_dispatch_table;
-        using impl::not_equal_output_id_table;
-        using impl::not_equal_strided_dispatch_table;
-
-        auto not_equal_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, not_equal_output_id_table,
-                not_equal_contig_dispatch_table,
-                not_equal_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto not_equal_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, not_equal_output_id_table);
-        };
-        m.def("_not_equal", not_equal_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_not_equal_result_type", not_equal_result_type_pyapi, "");
-    }
-
-    // U26: ==== POSITIVE (x)
-    {
-        impl::populate_positive_dispatch_vectors();
-        using impl::positive_contig_dispatch_vector;
-        using impl::positive_output_typeid_vector;
-        using impl::positive_strided_dispatch_vector;
-
-        auto positive_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, positive_output_typeid_vector, positive_contig_dispatch_vector, positive_strided_dispatch_vector);
-        };
-        m.def("_positive", positive_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto positive_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, positive_output_typeid_vector);
-        };
-        m.def("_positive_result_type", positive_result_type_pyapi);
-    }
-
-    // B21: ==== POW (x1, x2)
-    {
-        impl::populate_pow_dispatch_tables();
-        using impl::pow_contig_dispatch_table;
-        using impl::pow_output_id_table;
-        using impl::pow_strided_dispatch_table;
-
-        auto pow_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, pow_output_id_table,
-                pow_contig_dispatch_table,
-                pow_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto pow_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, pow_output_id_table);
-        };
-        m.def("_pow", pow_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_pow_result_type", pow_result_type_pyapi, "");
-    }
-
-    // U??: ==== PROJ (x)
-    {
-        impl::populate_proj_dispatch_vectors();
-        using impl::proj_contig_dispatch_vector;
-        using impl::proj_output_typeid_vector;
-        using impl::proj_strided_dispatch_vector;
-
-        auto proj_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, proj_output_typeid_vector, proj_contig_dispatch_vector, proj_strided_dispatch_vector);
-        };
-        m.def("_proj", proj_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto proj_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, proj_output_typeid_vector);
-        };
-        m.def("_proj_result_type", proj_result_type_pyapi);
-    }
-
-    // U27: ==== REAL (x)
-    {
-        impl::populate_real_dispatch_vectors();
-        using impl::real_contig_dispatch_vector;
-        using impl::real_output_typeid_vector;
-        using impl::real_strided_dispatch_vector;
-
-        auto real_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, real_output_typeid_vector, real_contig_dispatch_vector, real_strided_dispatch_vector);
-        };
-        m.def("_real", real_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto real_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, real_output_typeid_vector);
-        };
-        m.def("_real_result_type", real_result_type_pyapi);
-    }
-
-    // B22: ==== REMAINDER (x1, x2)
-    {
-        impl::populate_remainder_dispatch_tables();
-        using impl::remainder_contig_dispatch_table;
-        using impl::remainder_output_id_table;
-        using impl::remainder_strided_dispatch_table;
-
-        auto remainder_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, remainder_output_id_table,
-                remainder_contig_dispatch_table,
-                remainder_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto remainder_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, remainder_output_id_table);
-        };
-        m.def("_remainder", remainder_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_remainder_result_type", remainder_result_type_pyapi, "");
-    }
-
-    // U28: ==== ROUND (x)
-    {
-        impl::populate_round_dispatch_vectors();
-        using impl::round_contig_dispatch_vector;
-        using impl::round_output_typeid_vector;
-        using impl::round_strided_dispatch_vector;
-
-        auto round_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, round_output_typeid_vector, round_contig_dispatch_vector, round_strided_dispatch_vector);
-        };
-        m.def("_round", round_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto round_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, round_output_typeid_vector);
-        };
-        m.def("_round_result_type", round_result_type_pyapi);
-    }
-
-    // U29: ==== SIGN (x)
-    {
-        impl::populate_sign_dispatch_vectors();
-        using impl::sign_contig_dispatch_vector;
-        using impl::sign_output_typeid_vector;
-        using impl::sign_strided_dispatch_vector;
-
-        auto sign_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, sign_output_typeid_vector, sign_contig_dispatch_vector, sign_strided_dispatch_vector);
-        };
-        m.def("_sign", sign_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sign_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sign_output_typeid_vector);
-        };
-        m.def("_sign_result_type", sign_result_type_pyapi);
-    }
-
-    // ==== SIGNBIT (x)
-    {
-        impl::populate_signbit_dispatch_vectors();
-        using impl::signbit_contig_dispatch_vector;
-        using impl::signbit_output_typeid_vector;
-        using impl::signbit_strided_dispatch_vector;
-
-        auto signbit_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, signbit_output_typeid_vector, signbit_contig_dispatch_vector, signbit_strided_dispatch_vector);
-        };
-        m.def("_signbit", signbit_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto signbit_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, signbit_output_typeid_vector);
-        };
-        m.def("_signbit_result_type", signbit_result_type_pyapi);
-    }
-
-    // U30: ==== SIN (x)
-    {
-        impl::populate_sin_dispatch_vectors();
-        using impl::sin_contig_dispatch_vector;
-        using impl::sin_output_typeid_vector;
-        using impl::sin_strided_dispatch_vector;
-
-        auto sin_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, sin_output_typeid_vector, sin_contig_dispatch_vector, sin_strided_dispatch_vector);
-        };
-        m.def("_sin", sin_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sin_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sin_output_typeid_vector);
-        };
-        m.def("_sin_result_type", sin_result_type_pyapi);
-    }
-
-    // U31: ==== SINH (x)
-    {
-        impl::populate_sinh_dispatch_vectors();
-        using impl::sinh_contig_dispatch_vector;
-        using impl::sinh_output_typeid_vector;
-        using impl::sinh_strided_dispatch_vector;
-
-        auto sinh_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, sinh_output_typeid_vector, sinh_contig_dispatch_vector, sinh_strided_dispatch_vector);
-        };
-        m.def("_sinh", sinh_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sinh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sinh_output_typeid_vector);
-        };
-        m.def("_sinh_result_type", sinh_result_type_pyapi);
-    }
-
-    // U32: ==== SQUARE (x)
-    {
-        impl::populate_square_dispatch_vectors();
-        using impl::square_contig_dispatch_vector;
-        using impl::square_output_typeid_vector;
-        using impl::square_strided_dispatch_vector;
-
-        auto square_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, square_output_typeid_vector, square_contig_dispatch_vector, square_strided_dispatch_vector);
-        };
-        m.def("_square", square_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto square_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, square_output_typeid_vector);
-        };
-        m.def("_square_result_type", square_result_type_pyapi);
-    }
-
-    // U33: ==== SQRT (x)
-    {
-        impl::populate_sqrt_dispatch_vectors();
-        using impl::sqrt_contig_dispatch_vector;
-        using impl::sqrt_output_typeid_vector;
-        using impl::sqrt_strided_dispatch_vector;
-
-        auto sqrt_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, sqrt_output_typeid_vector, sqrt_contig_dispatch_vector, sqrt_strided_dispatch_vector);
-        };
-        m.def("_sqrt", sqrt_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sqrt_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sqrt_output_typeid_vector);
-        };
-        m.def("_sqrt_result_type", sqrt_result_type_pyapi);
-    }
-
-    // B23: ==== SUBTRACT (x1, x2)
-    {
-        impl::populate_subtract_dispatch_tables();
-        using impl::subtract_contig_dispatch_table;
-        using impl::subtract_contig_matrix_contig_row_broadcast_dispatch_table;
-        using impl::subtract_contig_row_contig_matrix_broadcast_dispatch_table;
-        using impl::subtract_output_id_table;
-        using impl::subtract_strided_dispatch_table;
-
-        auto subtract_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, subtract_output_id_table,
-                subtract_contig_dispatch_table,
-                subtract_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and c-contig row with broadcasting (may be nullptr)
-                subtract_contig_matrix_contig_row_broadcast_dispatch_table,
-                subtract_contig_row_contig_matrix_broadcast_dispatch_table);
-        };
-        auto subtract_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, subtract_output_id_table);
-        };
-        m.def("_subtract", subtract_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_subtract_result_type", subtract_result_type_pyapi, "");
-
subtract_output_id_table, - // function pointers to handle inplace operation on - // contiguous arrays (pointers may be nullptr) - subtract_inplace_contig_dispatch_table, - // function pointers to handle inplace operation on strided - // arrays (most general case) - subtract_inplace_strided_dispatch_table, - // function pointers to handle inplace operation on - // c-contig matrix with c-contig row with broadcasting - // (may be nullptr) - subtract_inplace_row_matrix_dispatch_table); - }; - m.def("_subtract_inplace", subtract_inplace_pyapi, "", py::arg("lhs"), - py::arg("rhs"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - } - - // U34: ==== TAN (x) - { - impl::populate_tan_dispatch_vectors(); - using impl::tan_contig_dispatch_vector; - using impl::tan_output_typeid_vector; - using impl::tan_strided_dispatch_vector; - - auto tan_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, tan_output_typeid_vector, - tan_contig_dispatch_vector, tan_strided_dispatch_vector); - }; - m.def("_tan", tan_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto tan_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, tan_output_typeid_vector); - }; - m.def("_tan_result_type", tan_result_type_pyapi); - } - - // U35: ==== TANH (x) - { - impl::populate_tanh_dispatch_vectors(); - using impl::tanh_contig_dispatch_vector; - using impl::tanh_output_typeid_vector; - using impl::tanh_strided_dispatch_vector; - - auto tanh_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, tanh_output_typeid_vector, - tanh_contig_dispatch_vector, tanh_strided_dispatch_vector); - }; - m.def("_tanh", tanh_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto tanh_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, tanh_output_typeid_vector); - }; - m.def("_tanh_result_type", tanh_result_type_pyapi); - } - - // U36: ==== TRUNC (x) - { - impl::populate_trunc_dispatch_vectors(); - using impl::trunc_contig_dispatch_vector; - using impl::trunc_output_typeid_vector; - using impl::trunc_strided_dispatch_vector; - - auto trunc_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, trunc_output_typeid_vector, - trunc_contig_dispatch_vector, trunc_strided_dispatch_vector); - }; - m.def("_trunc", trunc_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto trunc_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, - trunc_output_typeid_vector); - }; - m.def("_trunc_result_type", trunc_result_type_pyapi); - } - - // B24: ==== HYPOT (x1, x2) - { - impl::populate_hypot_dispatch_tables(); - using impl::hypot_contig_dispatch_table; - using impl::hypot_output_id_table; - using impl::hypot_strided_dispatch_table; - - auto hypot_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, - const dpctl::tensor::usm_ndarray &src2, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = {}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, hypot_output_id_table, - // function pointers to 
handle operation on contiguous arrays - // (pointers may be nullptr) - hypot_contig_dispatch_table, - // function pointers to handle operation on strided arrays (most - // general case) - hypot_strided_dispatch_table, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); - }; - auto hypot_result_type_pyapi = [&](const py::dtype &dtype1, - const py::dtype &dtype2) { - return py_binary_ufunc_result_type(dtype1, dtype2, - hypot_output_id_table); - }; - m.def("_hypot", hypot_pyapi, "", py::arg("src1"), py::arg("src2"), - py::arg("dst"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - m.def("_hypot_result_type", hypot_result_type_pyapi, ""); - } -} - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp new file mode 100644 index 0000000000..4b3e8b635b --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
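
Every per-function source file introduced from here on follows one recipe: three static per-dtype arrays (contiguous kernels, strided kernels, output type ids), a populate_*_dispatch_vectors() routine that fills them via DispatchVectorBuilder, and an init_*() hook that binds the Python entry points. As a minimal sketch of the dispatch-vector idea, assuming a toy two-dtype world and invented names (SketchAbsFactory, abs_kernel) rather than dpctl's actual type_dispatch machinery:

#include <array>
#include <cmath>
#include <cstddef>
#include <iostream>

using unary_fn_t = void (*)(const void *, void *, std::size_t);

template <typename T>
void abs_kernel(const void *src_p, void *dst_p, std::size_t n)
{
    const T *src = static_cast<const T *>(src_p);
    T *dst = static_cast<T *>(dst_p);
    for (std::size_t i = 0; i < n; ++i)
        dst[i] = std::abs(src[i]);
}

// Stands in for AbsContigFactory: yields the kernel for dtype T.
template <typename T> struct SketchAbsFactory
{
    unary_fn_t get() { return abs_kernel<T>; }
};

int main()
{
    // Two dtypes stand in for dpctl's td_ns::num_types entries.
    std::array<unary_fn_t, 2> dispatch_vector;
    dispatch_vector[0] = SketchAbsFactory<float>{}.get();
    dispatch_vector[1] = SketchAbsFactory<double>{}.get();

    double in[3] = {-1.5, 2.0, -3.25};
    double out[3];
    dispatch_vector[1](in, out, 3); // index 1 == double in this sketch
    std::cout << out[0] << ' ' << out[2] << '\n'; // prints "1.5 3.25"
}
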
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "abs.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/abs.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U01: ==== ABS (x)
+namespace impl
+{
+
+namespace abs_fn_ns = dpctl::tensor::kernels::abs;
+
+static unary_contig_impl_fn_ptr_t abs_contig_dispatch_vector[td_ns::num_types];
+static int abs_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    abs_strided_dispatch_vector[td_ns::num_types];
+
+void populate_abs_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = abs_fn_ns;
+
+    using fn_ns::AbsContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AbsContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(abs_contig_dispatch_vector);
+
+    using fn_ns::AbsStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AbsStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(abs_strided_dispatch_vector);
+
+    using fn_ns::AbsTypeMapFactory;
+    DispatchVectorBuilder<int, AbsTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(abs_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_abs(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_abs_dispatch_vectors();
+        using impl::abs_contig_dispatch_vector;
+        using impl::abs_output_typeid_vector;
+        using impl::abs_strided_dispatch_vector;
+
+        auto abs_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, abs_output_typeid_vector,
+                abs_contig_dispatch_vector, abs_strided_dispatch_vector);
+        };
+        m.def("_abs", abs_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto abs_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, abs_output_typeid_vector);
+        };
+        m.def("_abs_result_type", abs_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/abs.hpp
new file mode 100644
index 0000000000..d09eafc6bd
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/abs.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_abs(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp new file mode 100644 index 0000000000..011cc052fb --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
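
The _*_result_type bindings answer a pure type query, with no kernel launch: given an input dtype, what dtype would the result have, or is the input unsupported. A hedged sketch of that lookup follows; the four-entry table and its promotions are invented for illustration, while dpctl's real vector spans all td_ns::num_types dtypes:

#include <array>
#include <stdexcept>

enum sketch_typeid : int { b1 = 0, i4 = 1, f4 = 2, f8 = 3, num_ids = 4 };

constexpr std::array<int, num_ids> acos_out_typeid = {
    -1, // bool: unsupported for acos in this sketch
    f8, // int32 promotes to float64
    f4, // float32 -> float32
    f8, // float64 -> float64
};

int result_typeid(int src_typeid)
{
    int out = acos_out_typeid.at(src_typeid);
    if (out < 0)
        throw std::invalid_argument("dtype not supported");
    return out;
}
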
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "acos.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/acos.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U02: ==== ACOS (x) +namespace impl +{ + +namespace acos_fn_ns = dpctl::tensor::kernels::acos; + +static unary_contig_impl_fn_ptr_t acos_contig_dispatch_vector[td_ns::num_types]; +static int acos_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + acos_strided_dispatch_vector[td_ns::num_types]; + +void populate_acos_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = acos_fn_ns; + + using fn_ns::AcosContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(acos_contig_dispatch_vector); + + using fn_ns::AcosStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(acos_strided_dispatch_vector); + + using fn_ns::AcosTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(acos_output_typeid_vector); +}; + +} // namespace impl + +void init_acos(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_acos_dispatch_vectors(); + using impl::acos_contig_dispatch_vector; + using impl::acos_output_typeid_vector; + using impl::acos_strided_dispatch_vector; + + auto acos_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, acos_output_typeid_vector, + acos_contig_dispatch_vector, acos_strided_dispatch_vector); + }; + m.def("_acos", acos_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto acos_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, acos_output_typeid_vector); + }; + m.def("_acos_result_type", acos_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/acos.hpp new file mode 100644 index 0000000000..3a43d4087c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/acos.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_acos(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp new file mode 100644 index 0000000000..526bd44f12 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
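
Each populate_* routine fills two kernel vectors because py_unary_ufunc is handed both: a contiguous fast path that may assume densely packed unit-stride data, and a general strided fallback. Roughly, with plain-loop signatures standing in for dpctl's SYCL kernel ABI:

#include <cmath>
#include <cstddef>

// Contiguous kernel: unit stride, friendly to vectorization.
template <typename T>
void acosh_contig(const T *src, T *dst, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
        dst[i] = std::acosh(src[i]);
}

// Strided kernel: walks arbitrary element strides (most general case).
template <typename T>
void acosh_strided(const T *src, std::ptrdiff_t src_stride, T *dst,
                   std::ptrdiff_t dst_stride, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
        dst[i * dst_stride] = std::acosh(src[i * src_stride]);
}
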
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "acosh.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/acosh.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U03: ==== ACOSH (x) +namespace impl +{ + +namespace acosh_fn_ns = dpctl::tensor::kernels::acosh; + +static unary_contig_impl_fn_ptr_t + acosh_contig_dispatch_vector[td_ns::num_types]; +static int acosh_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + acosh_strided_dispatch_vector[td_ns::num_types]; + +void populate_acosh_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = acosh_fn_ns; + + using fn_ns::AcoshContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(acosh_contig_dispatch_vector); + + using fn_ns::AcoshStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(acosh_strided_dispatch_vector); + + using fn_ns::AcoshTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(acosh_output_typeid_vector); +}; + +} // namespace impl + +void init_acosh(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_acosh_dispatch_vectors(); + using impl::acosh_contig_dispatch_vector; + using impl::acosh_output_typeid_vector; + using impl::acosh_strided_dispatch_vector; + + auto acosh_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, acosh_output_typeid_vector, + acosh_contig_dispatch_vector, acosh_strided_dispatch_vector); + }; + m.def("_acosh", acosh_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto acosh_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + acosh_output_typeid_vector); + }; + m.def("_acosh_result_type", acosh_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.hpp new file mode 100644 index 0000000000..dd13ba886c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_acosh(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp new file mode 100644 index 0000000000..247b8e0283 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp @@ -0,0 +1,229 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
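
Binary operations such as add swap the one-dimensional vectors for num_types x num_types tables indexed by both input type ids; the same layout recurs for every B-numbered function in this patch. A small runnable sketch of the lookup convention, with kNumTypes and the dtype ids invented:

#include <cstddef>
#include <iostream>

constexpr std::size_t kNumTypes = 3; // stand-in for td_ns::num_types

using binary_fn_t = void (*)(const void *, const void *, void *,
                             std::size_t);

static void add_f8_f8(const void *a, const void *b, void *r, std::size_t n)
{
    const double *x = static_cast<const double *>(a);
    const double *y = static_cast<const double *>(b);
    double *out = static_cast<double *>(r);
    for (std::size_t i = 0; i < n; ++i)
        out[i] = x[i] + y[i];
}

int main()
{
    // Entries left nullptr mean "no specialized kernel for this pair";
    // the caller must promote inputs or report the combination as
    // unsupported.
    binary_fn_t add_table[kNumTypes][kNumTypes] = {};
    add_table[2][2] = add_f8_f8; // pretend id 2 == float64

    double a[2] = {1.0, 2.0}, b[2] = {0.5, 0.25}, r[2];
    if (binary_fn_t fn = add_table[2][2])
        fn(a, b, r, 2);
    std::cout << r[0] << ' ' << r[1] << '\n'; // prints "1.5 2.25"
}
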
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "add.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/add.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B01: ===== ADD (x1, x2) +namespace impl +{ + +namespace add_fn_ns = dpctl::tensor::kernels::add; + +static binary_contig_impl_fn_ptr_t add_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static int add_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + add_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +// add(matrix, row) +static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t + add_contig_matrix_contig_row_broadcast_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +// add(row, matrix) +static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t + add_contig_row_contig_matrix_broadcast_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + add_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + add_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; +static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t + add_inplace_row_matrix_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_add_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = add_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::AddTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(add_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::AddStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(add_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::AddContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(add_contig_dispatch_table); + + // function pointers for operation on contiguous matrix, contiguous row + // with contiguous matrix output + using fn_ns::AddContigMatrixContigRowBroadcastFactory; + DispatchTableBuilder< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, + AddContigMatrixContigRowBroadcastFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + add_contig_matrix_contig_row_broadcast_dispatch_table); + + // function pointers for operation on contiguous row, contiguous matrix + // with contiguous matrix output + using fn_ns::AddContigRowContigMatrixBroadcastFactory; + DispatchTableBuilder< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, + 
AddContigRowContigMatrixBroadcastFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + add_contig_row_contig_matrix_broadcast_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::AddInplaceStridedFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(add_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::AddInplaceContigFactory; + DispatchTableBuilder + dtb7; + dtb7.populate_dispatch_table(add_inplace_contig_dispatch_table); + + // function pointers for inplace operation on contiguous matrix + // and contiguous row + using fn_ns::AddInplaceRowMatrixBroadcastFactory; + DispatchTableBuilder + dtb8; + dtb8.populate_dispatch_table(add_inplace_row_matrix_dispatch_table); +}; + +} // namespace impl + +void init_add(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_add_dispatch_tables(); + using impl::add_contig_dispatch_table; + using impl::add_contig_matrix_contig_row_broadcast_dispatch_table; + using impl::add_contig_row_contig_matrix_broadcast_dispatch_table; + using impl::add_output_id_table; + using impl::add_strided_dispatch_table; + + auto add_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, add_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + add_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + add_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + add_contig_matrix_contig_row_broadcast_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + add_contig_row_contig_matrix_broadcast_dispatch_table); + }; + auto add_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + add_output_id_table); + }; + m.def("_add", add_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_add_result_type", add_result_type_pyapi, ""); + + using impl::add_inplace_contig_dispatch_table; + using impl::add_inplace_row_matrix_dispatch_table; + using impl::add_inplace_strided_dispatch_table; + + auto add_inplace_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, add_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + add_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + add_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + add_inplace_row_matrix_dispatch_table); + }; + m.def("_add_inplace", add_inplace_pyapi, "", py::arg("lhs"), + py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git 
a/dpctl/tensor/libtensor/source/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/add.hpp new file mode 100644 index 0000000000..5f88bfaa04 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/add.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_add(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp new file mode 100644 index 0000000000..14ef5e2665 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
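
Each new header exports a single init_<op>(py::module_) hook. Presumably a central routine chains them when assembling _tensor_impl; the aggregator below is hypothetical, and only the init_* signatures come from the headers in this patch:

#include <pybind11/pybind11.h>

namespace py = pybind11;

namespace dpctl::tensor::py_internal
{
// Declarations as exported by abs.hpp, acos.hpp, add.hpp, asin.hpp:
void init_abs(py::module_);
void init_acos(py::module_);
void init_add(py::module_);
void init_asin(py::module_);

// Hypothetical aggregator name, shown only to illustrate the flow.
void init_elementwise_sketch(py::module_ m)
{
    init_abs(m);
    init_acos(m);
    init_add(m);
    init_asin(m);
}
} // namespace dpctl::tensor::py_internal
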
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "asin.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/asin.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U04: ==== ASIN (x) +namespace impl +{ + +namespace asin_fn_ns = dpctl::tensor::kernels::asin; + +static unary_contig_impl_fn_ptr_t asin_contig_dispatch_vector[td_ns::num_types]; +static int asin_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + asin_strided_dispatch_vector[td_ns::num_types]; + +void populate_asin_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = asin_fn_ns; + + using fn_ns::AsinContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(asin_contig_dispatch_vector); + + using fn_ns::AsinStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(asin_strided_dispatch_vector); + + using fn_ns::AsinTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(asin_output_typeid_vector); +}; + +} // namespace impl + +void init_asin(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_asin_dispatch_vectors(); + using impl::asin_contig_dispatch_vector; + using impl::asin_output_typeid_vector; + using impl::asin_strided_dispatch_vector; + + auto asin_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, asin_output_typeid_vector, + asin_contig_dispatch_vector, asin_strided_dispatch_vector); + }; + m.def("_asin", asin_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto asin_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, asin_output_typeid_vector); + }; + m.def("_asin_result_type", asin_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/asin.hpp new file mode 100644 index 0000000000..0beed1d19c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/asin.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_asin(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp new file mode 100644 index 0000000000..dd0b4e62f7 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
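
All init_* bodies delegate the real work to py_unary_ufunc from elementwise_functions.hpp. Its contract is loosely reconstructed below under stated assumptions: ArraySketch is an invented stand-in for usm_ndarray, and the real function additionally validates queue compatibility and memory overlap and returns SYCL events rather than void:

#include <cstddef>
#include <stdexcept>

struct ArraySketch
{
    int type_id;
    bool c_contiguous;
    const void *data;
    void *mutable_data;
    std::size_t nelems;
};

using contig_fn_t = void (*)(const void *, void *, std::size_t);

template <std::size_t N>
void unary_ufunc_sketch(const ArraySketch &src, ArraySketch &dst,
                        const int (&out_typeid)[N],
                        contig_fn_t (&contig_fns)[N])
{
    // 1. Type check: the destination dtype must match the table.
    int expected = out_typeid[src.type_id];
    if (expected == -1 || expected != dst.type_id)
        throw std::invalid_argument("unsupported or mismatched dtypes");
    // 2. Kernel selection: contiguous fast path when available.
    if (src.c_contiguous && dst.c_contiguous && contig_fns[src.type_id]) {
        contig_fns[src.type_id](src.data, dst.mutable_data, src.nelems);
        return;
    }
    // 3. Otherwise the general strided kernel would run (omitted).
    throw std::runtime_error("strided path omitted in this sketch");
}
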
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "asinh.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/asinh.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U05: ==== ASINH (x) +namespace impl +{ + +namespace asinh_fn_ns = dpctl::tensor::kernels::asinh; + +static unary_contig_impl_fn_ptr_t + asinh_contig_dispatch_vector[td_ns::num_types]; +static int asinh_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + asinh_strided_dispatch_vector[td_ns::num_types]; + +void populate_asinh_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = asinh_fn_ns; + + using fn_ns::AsinhContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(asinh_contig_dispatch_vector); + + using fn_ns::AsinhStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(asinh_strided_dispatch_vector); + + using fn_ns::AsinhTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(asinh_output_typeid_vector); +}; + +} // namespace impl + +void init_asinh(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_asinh_dispatch_vectors(); + using impl::asinh_contig_dispatch_vector; + using impl::asinh_output_typeid_vector; + using impl::asinh_strided_dispatch_vector; + + auto asinh_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, asinh_output_typeid_vector, + asinh_contig_dispatch_vector, asinh_strided_dispatch_vector); + }; + m.def("_asinh", asinh_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto asinh_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + asinh_output_typeid_vector); + }; + m.def("_asinh_result_type", asinh_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/asinh.hpp new file mode 100644 index 0000000000..22cc37b2d8 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/asinh.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_asinh(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp new file mode 100644 index 0000000000..81ff00c46a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
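
One detail recurs in every m.def here: a lambda's C++ default argument (depends = {}) is invisible to pybind11, so the binding must restate it as py::arg("depends") = py::list(). A self-contained illustration of that mechanic without any SYCL dependency; the module and function names are invented:

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <vector>

namespace py = pybind11;

int count_deps(int x, const std::vector<int> &depends = {})
{
    return x + static_cast<int>(depends.size());
}

PYBIND11_MODULE(default_arg_sketch, m)
{
    // The C++ default above is not seen by pybind11; the binding must
    // restate it, exactly as these files do with py::list(). The empty
    // list is converted to an empty std::vector at call time.
    m.def("count_deps", &count_deps, py::arg("x"),
          py::arg("depends") = py::list());
}
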
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "atan.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/atan.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U06: ==== ATAN (x) +namespace impl +{ + +namespace atan_fn_ns = dpctl::tensor::kernels::atan; + +static unary_contig_impl_fn_ptr_t atan_contig_dispatch_vector[td_ns::num_types]; +static int atan_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + atan_strided_dispatch_vector[td_ns::num_types]; + +void populate_atan_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = atan_fn_ns; + + using fn_ns::AtanContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(atan_contig_dispatch_vector); + + using fn_ns::AtanStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(atan_strided_dispatch_vector); + + using fn_ns::AtanTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(atan_output_typeid_vector); +}; + +} // namespace impl + +void init_atan(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_atan_dispatch_vectors(); + using impl::atan_contig_dispatch_vector; + using impl::atan_output_typeid_vector; + using impl::atan_strided_dispatch_vector; + + auto atan_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, atan_output_typeid_vector, + atan_contig_dispatch_vector, atan_strided_dispatch_vector); + }; + m.def("_atan", atan_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto atan_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, atan_output_typeid_vector); + }; + m.def("_atan_result_type", atan_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan.hpp new file mode 100644 index 0000000000..86df06699c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/atan.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_atan(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp new file mode 100644 index 0000000000..d12a4ff540 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
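
Unlike add or subtract, atan2 registers no contiguous-matrix/contiguous-row broadcast specializations, so the binding passes td_ns::NullPtrTable instances, that is, tables whose every entry is nullptr. A hedged guess at what such a helper boils down to, with an illustrative type name:

#include <array>
#include <cstddef>

template <typename fnT, std::size_t N> struct NullPtrTableSketch
{
    // Every (i, j) entry is value-initialized to nullptr: callers
    // probing for a fast broadcast path find none and fall back to the
    // general strided kernel.
    std::array<std::array<fnT, N>, N> value{};
};

using broadcast_fn_t = void (*)();
NullPtrTableSketch<broadcast_fn_t, 3> no_fast_path; // all nullptr
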
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "atan2.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/atan2.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B02: ===== ATAN2 (x1, x2) +namespace impl +{ +namespace atan2_fn_ns = dpctl::tensor::kernels::atan2; + +static binary_contig_impl_fn_ptr_t + atan2_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int atan2_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + atan2_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_atan2_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = atan2_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::Atan2TypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(atan2_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::Atan2StridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(atan2_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::Atan2ContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(atan2_contig_dispatch_table); +}; + +} // namespace impl + +void init_atan2(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_atan2_dispatch_tables(); + using impl::atan2_contig_dispatch_table; + using impl::atan2_output_id_table; + using impl::atan2_strided_dispatch_table; + + auto atan2_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, atan2_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + atan2_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + atan2_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto atan2_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + atan2_output_id_table); + }; + m.def("_atan2", atan2_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_atan2_result_type", atan2_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace 
tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.hpp new file mode 100644 index 0000000000..f369d12208 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_atan2(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp new file mode 100644 index 0000000000..c42769b8d0 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
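
The exec_q and depends parameters threaded through every pyapi lambda implement ordinary SYCL event chaining: a kernel waits on caller-supplied events and returns its own. A generic sketch of that contract, where a fill kernel stands in for the actual atanh implementation and launch_after is an invented name:

#include <sycl/sycl.hpp>
#include <vector>

// dst is assumed to be USM memory allocated on q's context.
sycl::event launch_after(sycl::queue &q, float *dst, std::size_t n,
                         const std::vector<sycl::event> &depends)
{
    return q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends); // honor caller-supplied ordering
        cgh.parallel_for(sycl::range<1>{n},
                         [=](sycl::id<1> i) { dst[i] = 0.5f; });
    });
}
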
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "atanh.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/atanh.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U07: ==== ATANH (x) +namespace impl +{ + +namespace atanh_fn_ns = dpctl::tensor::kernels::atanh; + +static unary_contig_impl_fn_ptr_t + atanh_contig_dispatch_vector[td_ns::num_types]; +static int atanh_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + atanh_strided_dispatch_vector[td_ns::num_types]; + +void populate_atanh_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = atanh_fn_ns; + + using fn_ns::AtanhContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(atanh_contig_dispatch_vector); + + using fn_ns::AtanhStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(atanh_strided_dispatch_vector); + + using fn_ns::AtanhTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(atanh_output_typeid_vector); +}; + +} // namespace impl + +void init_atanh(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_atanh_dispatch_vectors(); + using impl::atanh_contig_dispatch_vector; + using impl::atanh_output_typeid_vector; + using impl::atanh_strided_dispatch_vector; + + auto atanh_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, atanh_output_typeid_vector, + atanh_contig_dispatch_vector, atanh_strided_dispatch_vector); + }; + m.def("_atanh", atanh_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto atanh_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + atanh_output_typeid_vector); + }; + m.def("_atanh_result_type", atanh_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.hpp new file mode 100644 index 0000000000..ba2930d80e --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_atanh(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp new file mode 100644 index 0000000000..f86f5112cd --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp @@ -0,0 +1,190 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations.
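The `populate_*_dispatch_vectors` functions above fill one function-pointer slot per supported dtype. As a simplified model of that pattern (illustrative only; dpctl's real `DispatchVectorBuilder` lives in `utils/type_dispatch.hpp`), a factory class template yields either a typed kernel entry point or `nullptr` for each element type:

```cpp
// Simplified dispatch-vector population: one slot per dtype, nullptr when
// a type is unsupported. Names and signatures here are assumptions.
#include <cstddef>
#include <type_traits>

using unary_fn = void (*)(std::size_t n, const char *src, char *dst);

template <typename T> struct ExampleContigFactory
{
    unary_fn get()
    {
        // toy rule: only floating-point input is supported
        if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
            return [](std::size_t, const char *, char *) {
                // a real factory would return a kernel that submits SYCL work
            };
        }
        else {
            return nullptr;
        }
    }
};

template <template <typename> class Factory, typename... Types>
void populate_vector(unary_fn (&vec)[sizeof...(Types)])
{
    std::size_t i = 0;
    ((vec[i++] = Factory<Types>{}.get()), ...); // one entry per dtype
}
```

The type-map builder (`dvb3` above) follows the same scheme but stores an output type id per input type instead of a function pointer.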
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "bitwise_and.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/bitwise_and.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B03: ===== BITWISE_AND (x1, x2) +namespace impl +{ +namespace bitwise_and_fn_ns = dpctl::tensor::kernels::bitwise_and; + +static binary_contig_impl_fn_ptr_t + bitwise_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int bitwise_and_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_and_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_and_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_and_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_and_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseAndTypeMapFactory; + DispatchTableBuilder<int, BitwiseAndTypeMapFactory, num_types> dtb1; + dtb1.populate_dispatch_table(bitwise_and_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::BitwiseAndStridedFactory; + DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseAndStridedFactory, num_types> + dtb2; + dtb2.populate_dispatch_table(bitwise_and_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseAndContigFactory; + DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseAndContigFactory, num_types> + dtb3; + dtb3.populate_dispatch_table(bitwise_and_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::BitwiseAndInplaceStridedFactory; + DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t, BitwiseAndInplaceStridedFactory, num_types> + dtb4; + dtb4.populate_dispatch_table(bitwise_and_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseAndInplaceContigFactory; + DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t, BitwiseAndInplaceContigFactory, num_types> + dtb5; + dtb5.populate_dispatch_table(bitwise_and_inplace_contig_dispatch_table); +}; + +} // namespace impl + +void init_bitwise_and(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_bitwise_and_dispatch_tables(); + using impl::bitwise_and_contig_dispatch_table; + using impl::bitwise_and_output_id_table; + using impl::bitwise_and_strided_dispatch_table; + + auto bitwise_and_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends,
bitwise_and_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_and_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_and_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_and_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + bitwise_and_output_id_table); + }; + m.def("_bitwise_and", bitwise_and_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_bitwise_and_result_type", bitwise_and_result_type_pyapi, ""); + + using impl::bitwise_and_inplace_contig_dispatch_table; + using impl::bitwise_and_inplace_strided_dispatch_table; + + auto bitwise_and_inplace_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, bitwise_and_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + bitwise_and_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + bitwise_and_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_and_inplace", bitwise_and_inplace_pyapi, "", + py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp new file mode 100644 index 0000000000..682b337efd --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_bitwise_and(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp new file mode 100644 index 0000000000..29a04cff38 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp @@ -0,0 +1,123 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "bitwise_invert.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/bitwise_invert.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U08: ===== BITWISE_INVERT (x) +namespace impl +{ + +namespace bitwise_invert_fn_ns = dpctl::tensor::kernels::bitwise_invert; + +static unary_contig_impl_fn_ptr_t + bitwise_invert_contig_dispatch_vector[td_ns::num_types]; +static int bitwise_invert_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + bitwise_invert_strided_dispatch_vector[td_ns::num_types]; + +void populate_bitwise_invert_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_invert_fn_ns; + + using fn_ns::BitwiseInvertContigFactory; + DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, BitwiseInvertContigFactory, num_types> + dvb1; + dvb1.populate_dispatch_vector(bitwise_invert_contig_dispatch_vector); + + using fn_ns::BitwiseInvertStridedFactory; + DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, BitwiseInvertStridedFactory, num_types> + dvb2; + dvb2.populate_dispatch_vector(bitwise_invert_strided_dispatch_vector); + + using fn_ns::BitwiseInvertTypeMapFactory; + DispatchVectorBuilder<int, BitwiseInvertTypeMapFactory, num_types> dvb3; + dvb3.populate_dispatch_vector(bitwise_invert_output_typeid_vector); +}; + +} // namespace impl + +void init_bitwise_invert(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_bitwise_invert_dispatch_vectors(); + using impl::bitwise_invert_contig_dispatch_vector; + using impl::bitwise_invert_output_typeid_vector; + using impl::bitwise_invert_strided_dispatch_vector; + + auto bitwise_invert_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc(src, dst, exec_q, depends, + bitwise_invert_output_typeid_vector, + bitwise_invert_contig_dispatch_vector, + bitwise_invert_strided_dispatch_vector); + }; + m.def("_bitwise_invert", bitwise_invert_pyapi, "", py::arg("src"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto bitwise_invert_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type( + dtype, bitwise_invert_output_typeid_vector); + }; + m.def("_bitwise_invert_result_type", bitwise_invert_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp new file mode 100644 index 0000000000..5b5d8398dc --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_bitwise_invert(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp new file mode 100644 index 0000000000..7969bc4ffa --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp @@ -0,0 +1,200 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
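Every unary `*_pyapi` lambda above delegates to `py_unary_ufunc`, which consults the type-map vector and then picks a contiguous or strided kernel. A hedged sketch of that selection step (the real helper also validates queues, shapes, and memory overlap; all names below are illustrative):

```cpp
// Illustrative kernel selection for a unary ufunc-style helper.
#include <cstddef>
#include <stdexcept>

using unary_fn = void (*)(std::size_t n, const char *src, char *dst);

inline unary_fn select_unary_kernel(int src_typeid,
                                    bool both_c_contiguous,
                                    const int *type_map,    // -1: unsupported
                                    const unary_fn *contig, // may hold nullptr
                                    const unary_fn *strided)
{
    if (type_map[src_typeid] < 0) {
        throw std::invalid_argument("input dtype is not supported");
    }
    // prefer the specialized contiguous kernel when the layout allows it
    if (both_c_contiguous && contig[src_typeid]) {
        return contig[src_typeid];
    }
    return strided[src_typeid];
}
```

The `_*_result_type` bindings expose only the first step of this logic, so Python-level code can query the output dtype without allocating anything.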
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "bitwise_left_shift.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/bitwise_left_shift.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B04: ===== BITWISE_LEFT_SHIFT (x1, x2) +namespace impl +{ +namespace bitwise_left_shift_fn_ns = dpctl::tensor::kernels::bitwise_left_shift; + +static binary_contig_impl_fn_ptr_t + bitwise_left_shift_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static int bitwise_left_shift_output_id_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_left_shift_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_left_shift_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_left_shift_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_left_shift_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_left_shift_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseLeftShiftTypeMapFactory; + DispatchTableBuilder<int, BitwiseLeftShiftTypeMapFactory, num_types> dtb1; + dtb1.populate_dispatch_table(bitwise_left_shift_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::BitwiseLeftShiftStridedFactory; + DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseLeftShiftStridedFactory, num_types> + dtb2; + dtb2.populate_dispatch_table(bitwise_left_shift_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseLeftShiftContigFactory; + DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseLeftShiftContigFactory, num_types> + dtb3; + dtb3.populate_dispatch_table(bitwise_left_shift_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::BitwiseLeftShiftInplaceStridedFactory; + DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t, BitwiseLeftShiftInplaceStridedFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + bitwise_left_shift_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseLeftShiftInplaceContigFactory; + DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t, BitwiseLeftShiftInplaceContigFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + bitwise_left_shift_inplace_contig_dispatch_table); +}; + +} // namespace impl + +void init_bitwise_left_shift(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { +
impl::populate_bitwise_left_shift_dispatch_tables(); + using impl::bitwise_left_shift_contig_dispatch_table; + using impl::bitwise_left_shift_output_id_table; + using impl::bitwise_left_shift_strided_dispatch_table; + + auto bitwise_left_shift_pyapi = [&](const arrayT &src1, + const arrayT &src2, + const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, + bitwise_left_shift_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_left_shift_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_left_shift_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_left_shift_result_type_pyapi = + [&](const py::dtype &dtype1, const py::dtype &dtype2) { + return py_binary_ufunc_result_type( + dtype1, dtype2, bitwise_left_shift_output_id_table); + }; + m.def("_bitwise_left_shift", bitwise_left_shift_pyapi, "", + py::arg("src1"), py::arg("src2"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_bitwise_left_shift_result_type", + bitwise_left_shift_result_type_pyapi, ""); + + using impl::bitwise_left_shift_inplace_contig_dispatch_table; + using impl::bitwise_left_shift_inplace_strided_dispatch_table; + + auto bitwise_left_shift_inplace_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, + bitwise_left_shift_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + bitwise_left_shift_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + bitwise_left_shift_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_left_shift_inplace", bitwise_left_shift_inplace_pyapi, + "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp new file mode 100644 index 0000000000..9edcba43ab --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_bitwise_left_shift(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp new file mode 100644 index 0000000000..33a57f907c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp @@ -0,0 +1,190 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations.
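The binary operations that follow index their dispatch tables by both input type ids. A sketch of the lookup behind the `_*_result_type` bindings (`N` stands in for `td_ns::num_types`; the sentinel convention is an assumption based on the negative-check pattern such helpers need):

```cpp
// 2-D output-id table lookup: entry [t1][t2] holds the result type id,
// or a negative sentinel when the dtype pair is unsupported.
#include <optional>

template <int N>
std::optional<int>
binary_result_typeid(const int (&output_id_table)[N][N], int t1, int t2)
{
    const int out = output_id_table[t1][t2];
    if (out < 0) {
        return std::nullopt; // no kernel registered for this combination
    }
    return out;
}
```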
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "bitwise_or.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/bitwise_or.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B05: ===== BITWISE_OR (x1, x2) +namespace impl +{ +namespace bitwise_or_fn_ns = dpctl::tensor::kernels::bitwise_or; + +static binary_contig_impl_fn_ptr_t + bitwise_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int bitwise_or_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_or_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_or_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_or_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_or_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseOrTypeMapFactory; + DispatchTableBuilder<int, BitwiseOrTypeMapFactory, num_types> dtb1; + dtb1.populate_dispatch_table(bitwise_or_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::BitwiseOrStridedFactory; + DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseOrStridedFactory, num_types> + dtb2; + dtb2.populate_dispatch_table(bitwise_or_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseOrContigFactory; + DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseOrContigFactory, num_types> + dtb3; + dtb3.populate_dispatch_table(bitwise_or_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::BitwiseOrInplaceStridedFactory; + DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t, BitwiseOrInplaceStridedFactory, num_types> + dtb4; + dtb4.populate_dispatch_table(bitwise_or_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseOrInplaceContigFactory; + DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t, BitwiseOrInplaceContigFactory, num_types> + dtb5; + dtb5.populate_dispatch_table(bitwise_or_inplace_contig_dispatch_table); +}; + +} // namespace impl + +void init_bitwise_or(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_bitwise_or_dispatch_tables(); + using impl::bitwise_or_contig_dispatch_table; + using impl::bitwise_or_output_id_table; + using impl::bitwise_or_strided_dispatch_table; + + auto bitwise_or_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, bitwise_or_output_id_table, + // function pointers to
handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_or_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_or_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_or_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + bitwise_or_output_id_table); + }; + m.def("_bitwise_or", bitwise_or_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_bitwise_or_result_type", bitwise_or_result_type_pyapi, ""); + + using impl::bitwise_or_inplace_contig_dispatch_table; + using impl::bitwise_or_inplace_strided_dispatch_table; + + auto bitwise_or_inplace_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, bitwise_or_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + bitwise_or_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + bitwise_or_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_or_inplace", bitwise_or_inplace_pyapi, "", + py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp new file mode 100644 index 0000000000..7603ed8277 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
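The `_*_inplace` bindings reuse the same output-id table as the out-of-place variant. An in-place update `lhs op= rhs` is only representable when the computed result dtype equals the lhs dtype, and a hedged sketch of that check (not dpctl's actual code) makes the constraint explicit:

```cpp
// In-place feasibility test derived from the binary output-id table:
// "lhs op= rhs" works only if result_type(lhs, rhs) == dtype(lhs).
template <int N>
bool inplace_representable(const int (&output_id_table)[N][N],
                           int lhs_typeid,
                           int rhs_typeid)
{
    return output_id_table[lhs_typeid][rhs_typeid] == lhs_typeid;
}
```

This is why the in-place bindings need no separate type map: the existing table already encodes which (lhs, rhs) pairs are usable.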
+//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_bitwise_or(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp new file mode 100644 index 0000000000..3847204b1f --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp @@ -0,0 +1,201 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "bitwise_right_shift.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/bitwise_right_shift.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B06: ===== BITWISE_RIGHT_SHIFT (x1, x2) +namespace impl +{ +namespace bitwise_right_shift_fn_ns = + dpctl::tensor::kernels::bitwise_right_shift; + +static binary_contig_impl_fn_ptr_t + bitwise_right_shift_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static int bitwise_right_shift_output_id_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_right_shift_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_right_shift_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_right_shift_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_right_shift_dispatch_tables(void) +{ + using
namespace td_ns; + namespace fn_ns = bitwise_right_shift_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseRightShiftTypeMapFactory; + DispatchTableBuilder<int, BitwiseRightShiftTypeMapFactory, num_types> dtb1; + dtb1.populate_dispatch_table(bitwise_right_shift_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::BitwiseRightShiftStridedFactory; + DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseRightShiftStridedFactory, num_types> + dtb2; + dtb2.populate_dispatch_table(bitwise_right_shift_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseRightShiftContigFactory; + DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseRightShiftContigFactory, num_types> + dtb3; + dtb3.populate_dispatch_table(bitwise_right_shift_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::BitwiseRightShiftInplaceStridedFactory; + DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t, BitwiseRightShiftInplaceStridedFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + bitwise_right_shift_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseRightShiftInplaceContigFactory; + DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t, BitwiseRightShiftInplaceContigFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + bitwise_right_shift_inplace_contig_dispatch_table); +}; + +} // namespace impl + +void init_bitwise_right_shift(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_bitwise_right_shift_dispatch_tables(); + using impl::bitwise_right_shift_contig_dispatch_table; + using impl::bitwise_right_shift_output_id_table; + using impl::bitwise_right_shift_strided_dispatch_table; + + auto bitwise_right_shift_pyapi = [&](const arrayT &src1, + const arrayT &src2, + const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, + bitwise_right_shift_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_right_shift_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_right_shift_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_right_shift_result_type_pyapi = + [&](const py::dtype &dtype1, const py::dtype &dtype2) { + return py_binary_ufunc_result_type( + dtype1, dtype2, bitwise_right_shift_output_id_table); + }; + m.def("_bitwise_right_shift", bitwise_right_shift_pyapi, "", + py::arg("src1"), py::arg("src2"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_bitwise_right_shift_result_type", + bitwise_right_shift_result_type_pyapi, ""); + + using impl::bitwise_right_shift_inplace_contig_dispatch_table; + using impl::bitwise_right_shift_inplace_strided_dispatch_table; + + auto bitwise_right_shift_inplace_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, + bitwise_right_shift_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) +
bitwise_right_shift_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + bitwise_right_shift_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_right_shift_inplace", bitwise_right_shift_inplace_pyapi, + "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp new file mode 100644 index 0000000000..5ce2bca4e7 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_bitwise_right_shift(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp new file mode 100644 index 0000000000..71d606766f --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp @@ -0,0 +1,190 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
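The `td_ns::NullPtrTable<...>{}` arguments passed above let the bitwise operations opt out of the specialized matrix/row broadcast fast paths. An illustrative stand-in for that idiom (details of the real type in `utils/type_dispatch.hpp` are assumed):

```cpp
// A value-initialized table of function pointers: every entry is nullptr,
// signalling "no specialized broadcast kernel exists for this operation".
template <typename FnT, int N> struct NullTableSketch
{
    FnT table[N][N] = {}; // value-initialization zeroes every pointer
};
```

Passing an all-null table instead of overloading the helper keeps `py_binary_ufunc` to a single signature: the helper simply checks for `nullptr` and falls back to the contiguous or strided paths.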
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "bitwise_xor.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/bitwise_xor.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B07: ===== BITWISE_XOR (x1, x2) +namespace impl +{ +namespace bitwise_xor_fn_ns = dpctl::tensor::kernels::bitwise_xor; + +static binary_contig_impl_fn_ptr_t + bitwise_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int bitwise_xor_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_xor_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_xor_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_xor_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_xor_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseXorTypeMapFactory; + DispatchTableBuilder<int, BitwiseXorTypeMapFactory, num_types> dtb1; + dtb1.populate_dispatch_table(bitwise_xor_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::BitwiseXorStridedFactory; + DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseXorStridedFactory, num_types> + dtb2; + dtb2.populate_dispatch_table(bitwise_xor_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseXorContigFactory; + DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseXorContigFactory, num_types> + dtb3; + dtb3.populate_dispatch_table(bitwise_xor_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::BitwiseXorInplaceStridedFactory; + DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t, BitwiseXorInplaceStridedFactory, num_types> + dtb4; + dtb4.populate_dispatch_table(bitwise_xor_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseXorInplaceContigFactory; + DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t, BitwiseXorInplaceContigFactory, num_types> + dtb5; + dtb5.populate_dispatch_table(bitwise_xor_inplace_contig_dispatch_table); +}; + +} // namespace impl + +void init_bitwise_xor(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_bitwise_xor_dispatch_tables(); + using impl::bitwise_xor_contig_dispatch_table; + using impl::bitwise_xor_output_id_table; + using impl::bitwise_xor_strided_dispatch_table; + + auto
bitwise_xor_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, bitwise_xor_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_xor_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_xor_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_xor_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + bitwise_xor_output_id_table); + }; + m.def("_bitwise_xor", bitwise_xor_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_bitwise_xor_result_type", bitwise_xor_result_type_pyapi, ""); + + using impl::bitwise_xor_inplace_contig_dispatch_table; + using impl::bitwise_xor_inplace_strided_dispatch_table; + + auto bitwise_xor_inplace_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, bitwise_xor_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + bitwise_xor_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + bitwise_xor_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_xor_inplace", bitwise_xor_inplace_pyapi, "", + py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp new file mode 100644 index 0000000000..7b092aadda --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
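For orientation, every binary operation in this patch populates the same five tables. Grouping them into one struct is purely illustrative (dpctl keeps them as separate file-scope arrays, and the value 14 for `td_ns::num_types` is an assumption covering bool, the eight integer dtypes, three real floating dtypes, and two complex dtypes):

```cpp
// Summary of the per-operation dispatch state; signatures elided.
using binary_fn = void (*)();
constexpr int N = 14; // assumed value of td_ns::num_types

struct BinaryOpDispatch
{
    int output_id[N][N];            // result dtype per (t1, t2) pair
    binary_fn contig[N][N];         // inputs and output all C-contiguous
    binary_fn strided[N][N];        // general strided case
    binary_fn inplace_contig[N][N]; // lhs op= rhs, contiguous
    binary_fn inplace_strided[N][N];// lhs op= rhs, strided
};
```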
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_bitwise_xor(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp new file mode 100644 index 0000000000..b42f234c0d --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "cbrt.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/cbrt.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U37: ==== CBRT (x) +namespace impl +{ + +namespace cbrt_fn_ns = dpctl::tensor::kernels::cbrt; + +static unary_contig_impl_fn_ptr_t cbrt_contig_dispatch_vector[td_ns::num_types]; +static int cbrt_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + cbrt_strided_dispatch_vector[td_ns::num_types]; + +void populate_cbrt_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = cbrt_fn_ns; + + using fn_ns::CbrtContigFactory; + DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CbrtContigFactory, num_types> + dvb1; + dvb1.populate_dispatch_vector(cbrt_contig_dispatch_vector); + + using fn_ns::CbrtStridedFactory; + DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CbrtStridedFactory, num_types> + dvb2; + dvb2.populate_dispatch_vector(cbrt_strided_dispatch_vector); + + using fn_ns::CbrtTypeMapFactory; + DispatchVectorBuilder<int, CbrtTypeMapFactory, num_types> dvb3; + dvb3.populate_dispatch_vector(cbrt_output_typeid_vector); +}; + +} // namespace impl + +void init_cbrt(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_cbrt_dispatch_vectors(); + using impl::cbrt_contig_dispatch_vector; + using impl::cbrt_output_typeid_vector; + using impl::cbrt_strided_dispatch_vector; + + auto cbrt_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, cbrt_output_typeid_vector, + cbrt_contig_dispatch_vector, cbrt_strided_dispatch_vector); + }; + m.def("_cbrt", cbrt_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto cbrt_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, cbrt_output_typeid_vector); + }; + m.def("_cbrt_result_type", cbrt_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.hpp new file mode 100644 index 0000000000..74da1de81a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
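All the `init_*` functions register bindings with the same pybind11 pattern: keyword arguments plus a default-empty `depends` list. A minimal, self-contained illustration of just that pattern (the module and function names here are hypothetical):

```cpp
#include <pybind11/pybind11.h>

namespace py = pybind11;

PYBIND11_MODULE(_sketch, m)
{
    auto noop_pyapi = [](const py::object &src, const py::object &dst,
                         const py::object &sycl_queue,
                         const py::list &depends) {
        // a real binding would dispatch a kernel and return events here
        (void)src;
        (void)dst;
        (void)sycl_queue;
        return py::len(depends);
    };
    m.def("_noop", noop_pyapi, "", py::arg("src"), py::arg("dst"),
          py::arg("sycl_queue"), py::arg("depends") = py::list());
}
```

Defaulting `depends` to an empty `py::list()` lets Python callers omit the dependency events entirely while still allowing explicit event chaining when queues are shared.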
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_cbrt(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp new file mode 100644 index 0000000000..f1bb362c5b --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "ceil.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/ceil.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U09: ==== CEIL (x) +namespace impl +{ + +namespace ceil_fn_ns = dpctl::tensor::kernels::ceil; + +static unary_contig_impl_fn_ptr_t ceil_contig_dispatch_vector[td_ns::num_types]; +static int ceil_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + ceil_strided_dispatch_vector[td_ns::num_types]; + +void populate_ceil_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = ceil_fn_ns; + + using fn_ns::CeilContigFactory; + DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CeilContigFactory, num_types> + dvb1; + dvb1.populate_dispatch_vector(ceil_contig_dispatch_vector); + + using fn_ns::CeilStridedFactory; + DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CeilStridedFactory, num_types> + dvb2; + dvb2.populate_dispatch_vector(ceil_strided_dispatch_vector); + + using fn_ns::CeilTypeMapFactory; + DispatchVectorBuilder<int, CeilTypeMapFactory, num_types> dvb3; + dvb3.populate_dispatch_vector(ceil_output_typeid_vector); +}; + +} // namespace impl + +void init_ceil(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_ceil_dispatch_vectors(); + using impl::ceil_contig_dispatch_vector; + using impl::ceil_output_typeid_vector; + using impl::ceil_strided_dispatch_vector; + + auto ceil_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, ceil_output_typeid_vector, + ceil_contig_dispatch_vector, ceil_strided_dispatch_vector); + }; + m.def("_ceil", ceil_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto ceil_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, ceil_output_typeid_vector); + }; + m.def("_ceil_result_type", ceil_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.hpp new file mode 100644 index 0000000000..4a6caf999b --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_ceil(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp
new file mode 100644
index 0000000000..cac84e63fb
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
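Each operation gets a matching header like ceil.hpp above that exposes only its init_<name>(py::module_) hook. A minimal self-contained sketch of the same registration split, with invented names (toy_ops, init_negate) standing in for the dpctl ones:

// Sketch only: each translation unit exposes one init hook, and a
// central module definition calls the hooks in turn.
#include <pybind11/pybind11.h>

namespace py = pybind11;

// would live in negate.cpp, declared extern in negate.hpp
void init_negate(py::module_ m)
{
    m.def("negate", [](int x) { return -x; });
}

PYBIND11_MODULE(toy_ops, m)
{
    init_negate(m); // each elementwise op registers itself here
}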
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "conj.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/conj.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U10: ==== CONJ (x) +namespace impl +{ + +namespace conj_fn_ns = dpctl::tensor::kernels::conj; + +static unary_contig_impl_fn_ptr_t conj_contig_dispatch_vector[td_ns::num_types]; +static int conj_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + conj_strided_dispatch_vector[td_ns::num_types]; + +void populate_conj_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = conj_fn_ns; + + using fn_ns::ConjContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(conj_contig_dispatch_vector); + + using fn_ns::ConjStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(conj_strided_dispatch_vector); + + using fn_ns::ConjTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(conj_output_typeid_vector); +}; + +} // namespace impl + +void init_conj(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_conj_dispatch_vectors(); + using impl::conj_contig_dispatch_vector; + using impl::conj_output_typeid_vector; + using impl::conj_strided_dispatch_vector; + + auto conj_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, conj_output_typeid_vector, + conj_contig_dispatch_vector, conj_strided_dispatch_vector); + }; + m.def("_conj", conj_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto conj_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, conj_output_typeid_vector); + }; + m.def("_conj_result_type", conj_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/conj.hpp new file mode 100644 index 0000000000..33d9993019 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/conj.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_conj(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp new file mode 100644 index 0000000000..6a887e0345 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
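The populate_*_dispatch_vectors functions above all follow one pattern: a factory template is instantiated once per supported type, and the results are gathered into a typeid-indexed array of function pointers. A self-contained sketch of that idea, with toy stand-ins for DispatchVectorBuilder and the dpctl typeid set:

// Sketch only: NegateFactory and the three-type typeid set are
// illustrative, not dpctl's actual utilities.
#include <cstddef>
#include <iostream>

using unary_fn_ptr_t = void (*)(const void *, void *, std::size_t);

template <typename T>
void negate_impl(const void *src, void *dst, std::size_t n)
{
    const T *s = static_cast<const T *>(src);
    T *d = static_cast<T *>(dst);
    for (std::size_t i = 0; i < n; ++i)
        d[i] = -s[i];
}

template <typename T> struct NegateFactory
{
    unary_fn_ptr_t get() { return negate_impl<T>; }
};

// typeid 0 -> int, 1 -> float, 2 -> double (toy numbering)
static unary_fn_ptr_t negate_dispatch_vector[3];

void populate_negate_dispatch_vector()
{
    negate_dispatch_vector[0] = NegateFactory<int>{}.get();
    negate_dispatch_vector[1] = NegateFactory<float>{}.get();
    negate_dispatch_vector[2] = NegateFactory<double>{}.get();
}

int main()
{
    populate_negate_dispatch_vector();
    double in[3] = {1.0, -2.5, 3.0}, out[3];
    negate_dispatch_vector[2](in, out, 3); // typeid 2 selects the double kernel
    std::cout << out[0] << " " << out[1] << " " << out[2] << "\n";
}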
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "copysign.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/copysign.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B25: ===== COPYSIGN (x1, x2) +namespace impl +{ +namespace copysign_fn_ns = dpctl::tensor::kernels::copysign; + +static binary_contig_impl_fn_ptr_t + copysign_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int copysign_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + copysign_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_copysign_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = copysign_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::CopysignTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(copysign_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::CopysignStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(copysign_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::CopysignContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(copysign_contig_dispatch_table); +}; + +} // namespace impl + +void init_copysign(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_copysign_dispatch_tables(); + using impl::copysign_contig_dispatch_table; + using impl::copysign_output_id_table; + using impl::copysign_strided_dispatch_table; + + auto copysign_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, copysign_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + copysign_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + copysign_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto copysign_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + copysign_output_id_table); + }; + m.def("_copysign", copysign_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + 
m.def("_copysign_result_type", copysign_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.hpp new file mode 100644 index 0000000000..d22cbdb0f0 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_copysign(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp new file mode 100644 index 0000000000..1986610510 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
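Binary operations such as copysign use two-dimensional tables indexed by both argument typeids, with a negative entry marking an unsupported combination. A minimal sketch of that lookup under an invented three-type system:

// Sketch only: the typeids and table contents are illustrative.
#include <iostream>

constexpr int num_types = 3; // 0: int, 1: float, 2: double (toy set)

// result typeid for a copysign-like real-valued op; -1 means unsupported
static int output_id_table[num_types][num_types] = {
    /* int    */ {-1, 1, 2},
    /* float  */ {1, 1, 2},
    /* double */ {2, 2, 2},
};

int result_typeid(int t1, int t2)
{
    if (t1 < 0 || t1 >= num_types || t2 < 0 || t2 >= num_types)
        return -1;
    return output_id_table[t1][t2];
}

int main()
{
    std::cout << result_typeid(1, 2) << "\n"; // float, double -> double (2)
    std::cout << result_typeid(0, 0) << "\n"; // int, int -> unsupported (-1)
}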
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "cos.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/cos.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U11: ==== COS (x) +namespace impl +{ + +namespace cos_fn_ns = dpctl::tensor::kernels::cos; + +static unary_contig_impl_fn_ptr_t cos_contig_dispatch_vector[td_ns::num_types]; +static int cos_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + cos_strided_dispatch_vector[td_ns::num_types]; + +void populate_cos_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = cos_fn_ns; + + using fn_ns::CosContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(cos_contig_dispatch_vector); + + using fn_ns::CosStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(cos_strided_dispatch_vector); + + using fn_ns::CosTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(cos_output_typeid_vector); +}; + +} // namespace impl + +void init_cos(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_cos_dispatch_vectors(); + using impl::cos_contig_dispatch_vector; + using impl::cos_output_typeid_vector; + using impl::cos_strided_dispatch_vector; + + auto cos_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, cos_output_typeid_vector, + cos_contig_dispatch_vector, cos_strided_dispatch_vector); + }; + m.def("_cos", cos_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto cos_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, cos_output_typeid_vector); + }; + m.def("_cos_result_type", cos_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/cos.hpp new file mode 100644 index 0000000000..1753058024 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cos.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_cos(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp new file mode 100644 index 0000000000..0bb74df979 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
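Each dispatch vector pairs a contiguous implementation with a strided one: the contiguous kernel walks memory linearly, while the strided kernel must honor arbitrary element strides. An illustrative host-side sketch of the distinction (the real kernels are SYCL submissions, not plain loops):

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// contiguous path: unit stride, easy for the compiler to vectorize
void cos_contig(const double *src, double *dst, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
        dst[i] = std::cos(src[i]);
}

// strided path: element offsets computed from explicit strides
void cos_strided(const double *src, double *dst, std::size_t n,
                 std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride)
{
    for (std::size_t i = 0; i < n; ++i)
        dst[i * dst_stride] = std::cos(src[i * src_stride]);
}

int main()
{
    std::vector<double> a = {0.0, 0.5, 1.0, 1.5}, b(4, -1.0);
    cos_contig(a.data(), b.data(), 4);        // fills all of b
    cos_strided(a.data(), b.data(), 2, 2, 2); // touches b[0] and b[2] only
    std::cout << b[0] << " " << b[1] << "\n";
}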
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "cosh.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/cosh.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U12: ==== COSH (x) +namespace impl +{ + +namespace cosh_fn_ns = dpctl::tensor::kernels::cosh; + +static unary_contig_impl_fn_ptr_t cosh_contig_dispatch_vector[td_ns::num_types]; +static int cosh_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + cosh_strided_dispatch_vector[td_ns::num_types]; + +void populate_cosh_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = cosh_fn_ns; + + using fn_ns::CoshContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(cosh_contig_dispatch_vector); + + using fn_ns::CoshStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(cosh_strided_dispatch_vector); + + using fn_ns::CoshTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(cosh_output_typeid_vector); +}; + +} // namespace impl + +void init_cosh(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_cosh_dispatch_vectors(); + using impl::cosh_contig_dispatch_vector; + using impl::cosh_output_typeid_vector; + using impl::cosh_strided_dispatch_vector; + + auto cosh_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, cosh_output_typeid_vector, + cosh_contig_dispatch_vector, cosh_strided_dispatch_vector); + }; + m.def("_cosh", cosh_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto cosh_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, cosh_output_typeid_vector); + }; + m.def("_cosh_result_type", cosh_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.hpp new file mode 100644 index 0000000000..c1eba05ea5 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_cosh(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp new file mode 100644 index 0000000000..751e44ff55 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp @@ -0,0 +1,181 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "abs.hpp"
+#include "acos.hpp"
+#include "acosh.hpp"
+#include "add.hpp"
+#include "asin.hpp"
+#include "asinh.hpp"
+#include "atan.hpp"
+#include "atan2.hpp"
+#include "atanh.hpp"
+#include "bitwise_and.hpp"
+#include "bitwise_invert.hpp"
+#include "bitwise_left_shift.hpp"
+#include "bitwise_or.hpp"
+#include "bitwise_right_shift.hpp"
+#include "bitwise_xor.hpp"
+#include "cbrt.hpp"
+#include "ceil.hpp"
+#include "conj.hpp"
+#include "copysign.hpp"
+#include "cos.hpp"
+#include "cosh.hpp"
+#include "equal.hpp"
+#include "exp.hpp"
+#include "exp2.hpp"
+#include "expm1.hpp"
+#include "floor.hpp"
+#include "floor_divide.hpp"
+#include "greater.hpp"
+#include "greater_equal.hpp"
+#include "hypot.hpp"
+#include "imag.hpp"
+#include "isfinite.hpp"
+#include "isinf.hpp"
+#include "isnan.hpp"
+#include "less.hpp"
+#include "less_equal.hpp"
+#include "log.hpp"
+#include "log10.hpp"
+#include "log1p.hpp"
+#include "log2.hpp"
+#include "logaddexp.hpp"
+#include "logical_and.hpp"
+#include "logical_not.hpp"
+#include "logical_or.hpp"
+#include "logical_xor.hpp"
+#include "maximum.hpp"
+#include "minimum.hpp"
+#include "multiply.hpp"
+#include "negative.hpp"
+#include "not_equal.hpp"
+#include "positive.hpp"
+#include "pow.hpp"
+#include "proj.hpp"
+#include "real.hpp"
+#include "remainder.hpp"
+#include "round.hpp"
+#include "rsqrt.hpp"
+#include "sign.hpp"
+#include "signbit.hpp"
+#include "sin.hpp"
+#include "sinh.hpp"
+#include "sqrt.hpp"
+#include "square.hpp"
+#include "subtract.hpp"
+#include "tan.hpp"
+#include "tanh.hpp"
+#include "true_divide.hpp"
+#include "trunc.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+/*!
@brief Add elementwise functions to Python module */ +void init_elementwise_functions(py::module_ m) +{ + init_abs(m); + init_acos(m); + init_acosh(m); + init_add(m); + init_asin(m); + init_asinh(m); + init_atan(m); + init_atan2(m); + init_atanh(m); + init_bitwise_and(m); + init_bitwise_invert(m); + init_bitwise_left_shift(m); + init_bitwise_or(m); + init_bitwise_right_shift(m); + init_bitwise_xor(m); + init_cbrt(m); + init_ceil(m); + init_conj(m); + init_copysign(m); + init_cos(m); + init_cosh(m); + init_divide(m); + init_equal(m); + init_exp(m); + init_exp2(m); + init_expm1(m); + init_floor(m); + init_floor_divide(m); + init_greater(m); + init_greater_equal(m); + init_hypot(m); + init_imag(m); + init_isfinite(m); + init_isinf(m); + init_isnan(m); + init_less(m); + init_less_equal(m); + init_log(m); + init_log10(m); + init_log1p(m); + init_log2(m); + init_logaddexp(m); + init_logical_and(m); + init_logical_not(m); + init_logical_or(m); + init_logical_xor(m); + init_maximum(m); + init_minimum(m); + init_multiply(m); + init_negative(m); + init_not_equal(m); + init_positive(m); + init_pow(m); + init_proj(m); + init_real(m); + init_remainder(m); + init_round(m); + init_rsqrt(m); + init_sign(m); + init_signbit(m); + init_sin(m); + init_sinh(m); + init_sqrt(m); + init_square(m); + init_subtract(m); + init_tan(m); + init_tanh(m); + init_trunc(m); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp new file mode 100644 index 0000000000..ef9182f9a2 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
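init_elementwise_functions simply chains the per-file hooks, so adding a new operation means one new #include and one new init_*(m) call. An equivalent, illustrative form of the same aggregation using an array of hooks (all names invented):

#include <pybind11/pybind11.h>

namespace py = pybind11;

void init_op_a(py::module_ m) { m.def("_op_a", [](int x) { return x + 1; }); }
void init_op_b(py::module_ m) { m.def("_op_b", [](int x) { return x - 1; }); }

void init_all_demo(py::module_ m)
{
    using init_fn_t = void (*)(py::module_);
    constexpr init_fn_t hooks[] = {&init_op_a, &init_op_b};
    for (init_fn_t hook : hooks)
        hook(m); // each hook adds only its own defs, so order is immaterial
}

PYBIND11_MODULE(toy_impl, m) { init_all_demo(m); }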
+//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_elementwise_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp similarity index 97% rename from dpctl/tensor/libtensor/source/elementwise_functions.hpp rename to dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp index 523e4259c3..6817a3541c 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions.hpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp @@ -22,7 +22,6 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions, /// specifically functions for elementwise operations. //===----------------------------------------------------------------------===// - #pragma once #include "dpctl4pybind11.hpp" @@ -30,14 +29,17 @@ #include #include #include -#include #include +#include "elementwise_functions_type_utils.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" #include "utils/type_dispatch.hpp" +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + namespace dpctl { namespace tensor @@ -45,11 +47,7 @@ namespace tensor namespace py_internal { -namespace td_ns = dpctl::tensor::type_dispatch; - -extern py::dtype _dtype_from_typenum(td_ns::typenum_t dst_typenum_t); -extern int _result_typeid(int arg_typeid, const int *fn_output_id); - +/*! @brief Template implementing Python API for unary elementwise functions */ template @@ -251,6 +249,8 @@ py_unary_ufunc(const dpctl::tensor::usm_ndarray &src, strided_fn_ev); } +/*! @brief Template implementing Python API for querying of type support by + * unary elementwise functions */ template py::object py_unary_ufunc_result_type(const py::dtype &input_dtype, const output_typesT &output_types) @@ -266,6 +266,7 @@ py::object py_unary_ufunc_result_type(const py::dtype &input_dtype, throw py::value_error(e.what()); } + using dpctl::tensor::py_internal::type_utils::_result_typeid; int dst_typeid = _result_typeid(src_typeid, output_types); if (dst_typeid < 0) { @@ -273,8 +274,9 @@ py::object py_unary_ufunc_result_type(const py::dtype &input_dtype, return py::cast(res); } else { - auto dst_typenum_t = static_cast(dst_typeid); + using dpctl::tensor::py_internal::type_utils::_dtype_from_typenum; + auto dst_typenum_t = static_cast(dst_typeid); auto dt = _dtype_from_typenum(dst_typenum_t); return py::cast(dt); @@ -292,6 +294,8 @@ bool isEqual(Container const &c, std::initializer_list const &l) } } // namespace +/*! @brief Template implementing Python API for binary elementwise + * functions */ template py_binary_ufunc( strided_fn_ev); } +/*! 
@brief Type querying for binary elementwise functions */ template py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype, const py::dtype &input2_dtype, @@ -590,8 +595,9 @@ py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype, return py::cast(res); } else { - auto dst_typenum_t = static_cast(dst_typeid); + using dpctl::tensor::py_internal::type_utils::_dtype_from_typenum; + auto dst_typenum_t = static_cast(dst_typeid); auto dt = _dtype_from_typenum(dst_typenum_t); return py::cast(dt); @@ -825,8 +831,6 @@ py_binary_inplace_ufunc(const dpctl::tensor::usm_ndarray &lhs, strided_fn_ev); } -extern void init_elementwise_functions(py::module_ m); - } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp new file mode 100644 index 0000000000..473048e8fa --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp @@ -0,0 +1,95 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions for looking of supported types in elementwise +/// functions. 
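The *_result_type entry points built on py_unary_ufunc_result_type and py_binary_ufunc_result_type implement a pure type query: map the input dtype to a typeid, look the output typeid up in the op's table, and map back to a dtype, with None returned for unsupported inputs. A self-contained sketch of that flow under an invented three-type system:

// Sketch only: type_names and cos_output_ids are illustrative.
#include <iostream>
#include <optional>
#include <string>

constexpr int num_types = 3;
static const char *type_names[num_types] = {"int32", "float32", "float64"};
static int cos_output_ids[num_types] = {2, 1, 2}; // toy: int promotes to double

std::optional<std::string> result_type(int src_typeid)
{
    if (src_typeid < 0 || src_typeid >= num_types)
        return std::nullopt; // corresponds to raising py::value_error
    int dst = cos_output_ids[src_typeid];
    if (dst < 0)
        return std::nullopt; // corresponds to returning py::none()
    return std::string(type_names[dst]);
}

int main()
{
    std::cout << result_type(0).value_or("None") << "\n"; // float64
}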
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions_type_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+namespace type_utils
+{
+
+py::dtype _dtype_from_typenum(td_ns::typenum_t dst_typenum_t)
+{
+    switch (dst_typenum_t) {
+    case td_ns::typenum_t::BOOL:
+        return py::dtype("?");
+    case td_ns::typenum_t::INT8:
+        return py::dtype("i1");
+    case td_ns::typenum_t::UINT8:
+        return py::dtype("u1");
+    case td_ns::typenum_t::INT16:
+        return py::dtype("i2");
+    case td_ns::typenum_t::UINT16:
+        return py::dtype("u2");
+    case td_ns::typenum_t::INT32:
+        return py::dtype("i4");
+    case td_ns::typenum_t::UINT32:
+        return py::dtype("u4");
+    case td_ns::typenum_t::INT64:
+        return py::dtype("i8");
+    case td_ns::typenum_t::UINT64:
+        return py::dtype("u8");
+    case td_ns::typenum_t::HALF:
+        return py::dtype("f2");
+    case td_ns::typenum_t::FLOAT:
+        return py::dtype("f4");
+    case td_ns::typenum_t::DOUBLE:
+        return py::dtype("f8");
+    case td_ns::typenum_t::CFLOAT:
+        return py::dtype("c8");
+    case td_ns::typenum_t::CDOUBLE:
+        return py::dtype("c16");
+    default:
+        throw py::value_error("Unrecognized dst_typeid");
+    }
+}
+
+int _result_typeid(int arg_typeid, const int *fn_output_id)
+{
+    if (arg_typeid < 0 || arg_typeid >= td_ns::num_types) {
+        throw py::value_error("Input typeid " + std::to_string(arg_typeid) +
+                              " is outside of expected bounds.");
+    }
+
+    return fn_output_id[arg_typeid];
+}
+
+} // namespace type_utils
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
new file mode 100644
index 0000000000..6dac195dc2
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
@@ -0,0 +1,56 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares functions for looking up supported types in elementwise
+/// functions.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "utils/type_dispatch.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+namespace type_utils
+{
+
+/*! @brief Produce dtype from a type number */
+extern py::dtype _dtype_from_typenum(td_ns::typenum_t);
+
+/*!
@brief Lookup typeid of the result from typeid of + * argument and the mapping table */ +extern int _result_typeid(int, const int *); + +} // namespace type_utils +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp new file mode 100644 index 0000000000..f36ec1b446 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "equal.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/equal.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B09: ===== EQUAL (x1, x2) +namespace impl +{ +namespace equal_fn_ns = dpctl::tensor::kernels::equal; + +static binary_contig_impl_fn_ptr_t + equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int equal_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_equal_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = equal_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::EqualTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(equal_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::EqualStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(equal_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::EqualContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(equal_contig_dispatch_table); +}; + +} // namespace impl + +void init_equal(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + 
impl::populate_equal_dispatch_tables(); + using impl::equal_contig_dispatch_table; + using impl::equal_output_id_table; + using impl::equal_strided_dispatch_table; + + auto equal_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, equal_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + equal_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + equal_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto equal_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + equal_output_id_table); + }; + m.def("_equal", equal_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_equal_result_type", equal_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/equal.hpp new file mode 100644 index 0000000000..21ac4ad6b4 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/equal.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
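Operations that provide no contig-matrix/contig-row broadcast specializations, such as _equal above, pass NullPtrTable placeholders; py_binary_ufunc then falls back to the general strided path for those shapes. A sketch of that null-means-fallback convention (NullPtrTableDemo and the callback types are invented):

#include <iostream>

constexpr int num_types = 2;
using contig_fn_t = void (*)();

template <typename fnT> struct NullPtrTableDemo
{
    fnT table[num_types][num_types] = {}; // value-initialized to nullptr
};

void dispatch(contig_fn_t specialized, void (*fallback)())
{
    if (specialized)
        specialized();
    else
        fallback(); // no specialization registered: general strided path
}

int main()
{
    NullPtrTableDemo<contig_fn_t> no_broadcast_impls;
    dispatch(no_broadcast_impls.table[0][1],
             [] { std::cout << "general strided fallback\n"; });
}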
+//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_equal(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp new file mode 100644 index 0000000000..51ccaaac70 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "exp.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/exp.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U13: ==== EXP (x) +namespace impl +{ + +namespace exp_fn_ns = dpctl::tensor::kernels::exp; + +static unary_contig_impl_fn_ptr_t exp_contig_dispatch_vector[td_ns::num_types]; +static int exp_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + exp_strided_dispatch_vector[td_ns::num_types]; + +void populate_exp_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = exp_fn_ns; + + using fn_ns::ExpContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(exp_contig_dispatch_vector); + + using fn_ns::ExpStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(exp_strided_dispatch_vector); + + using fn_ns::ExpTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(exp_output_typeid_vector); +}; + +} // namespace impl + +void init_exp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_exp_dispatch_vectors(); + using impl::exp_contig_dispatch_vector; + using impl::exp_output_typeid_vector; + using impl::exp_strided_dispatch_vector; + + auto exp_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, 
exp_output_typeid_vector, + exp_contig_dispatch_vector, exp_strided_dispatch_vector); + }; + m.def("_exp", exp_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto exp_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, exp_output_typeid_vector); + }; + m.def("_exp_result_type", exp_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp.hpp new file mode 100644 index 0000000000..7227f0a2dc --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/exp.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_exp(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp new file mode 100644 index 0000000000..438ad0800e --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
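The dispatch vectors above are file-scope statics populated exactly once from the init_* functions during module import, which appears to rely on import running single-threaded under the GIL; no explicit guard is used. A sketch of the same populate-once idiom with an explicit std::call_once guard, for contexts where that assumption does not hold:

#include <iostream>
#include <mutex>

static int table[3];

void populate_table()
{
    static std::once_flag flag;
    std::call_once(flag, [] {
        table[0] = 10;
        table[1] = 11;
        table[2] = 12;
        std::cout << "populated\n";
    });
}

int main()
{
    populate_table();
    populate_table(); // second call is a no-op
}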
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "exp2.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/exp2.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U38: ==== EXP2 (x) +namespace impl +{ + +namespace exp2_fn_ns = dpctl::tensor::kernels::exp2; + +static unary_contig_impl_fn_ptr_t exp2_contig_dispatch_vector[td_ns::num_types]; +static int exp2_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + exp2_strided_dispatch_vector[td_ns::num_types]; + +void populate_exp2_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = exp2_fn_ns; + + using fn_ns::Exp2ContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(exp2_contig_dispatch_vector); + + using fn_ns::Exp2StridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(exp2_strided_dispatch_vector); + + using fn_ns::Exp2TypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(exp2_output_typeid_vector); +}; + +} // namespace impl + +void init_exp2(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_exp2_dispatch_vectors(); + using impl::exp2_contig_dispatch_vector; + using impl::exp2_output_typeid_vector; + using impl::exp2_strided_dispatch_vector; + + auto exp2_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, exp2_output_typeid_vector, + exp2_contig_dispatch_vector, exp2_strided_dispatch_vector); + }; + m.def("_exp2", exp2_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto exp2_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, exp2_output_typeid_vector); + }; + m.def("_exp2_result_type", exp2_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.hpp new file mode 100644 index 0000000000..be041e1f8d --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_exp2(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp new file mode 100644 index 0000000000..3b9332c4f1 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "expm1.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/expm1.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U14: ==== EXPM1 (x) +namespace impl +{ + +namespace expm1_fn_ns = dpctl::tensor::kernels::expm1; + +static unary_contig_impl_fn_ptr_t + expm1_contig_dispatch_vector[td_ns::num_types]; +static int expm1_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + expm1_strided_dispatch_vector[td_ns::num_types]; + +void populate_expm1_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = expm1_fn_ns; + + using fn_ns::Expm1ContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(expm1_contig_dispatch_vector); + + using fn_ns::Expm1StridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(expm1_strided_dispatch_vector); + + using fn_ns::Expm1TypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(expm1_output_typeid_vector); +}; + +} // namespace impl + +void init_expm1(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_expm1_dispatch_vectors(); + using impl::expm1_contig_dispatch_vector; + using impl::expm1_output_typeid_vector; + using impl::expm1_strided_dispatch_vector; + + auto expm1_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, expm1_output_typeid_vector, + expm1_contig_dispatch_vector, expm1_strided_dispatch_vector); + }; + m.def("_expm1", expm1_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto expm1_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + expm1_output_typeid_vector); + }; + m.def("_expm1_result_type", expm1_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp new file mode 100644 index 0000000000..6e39644835 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp
new file mode 100644
index 0000000000..6e39644835
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_expm1(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp
new file mode 100644
index 0000000000..9ccf89f13a
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "floor.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/floor.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U15: ==== FLOOR (x)
+namespace impl
+{
+
+namespace floor_fn_ns = dpctl::tensor::kernels::floor;
+
+static unary_contig_impl_fn_ptr_t
+    floor_contig_dispatch_vector[td_ns::num_types];
+static int floor_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    floor_strided_dispatch_vector[td_ns::num_types];
+
+void populate_floor_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = floor_fn_ns;
+
+    using fn_ns::FloorContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, FloorContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(floor_contig_dispatch_vector);
+
+    using fn_ns::FloorStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, FloorStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(floor_strided_dispatch_vector);
+
+    using fn_ns::FloorTypeMapFactory;
+    DispatchVectorBuilder<int, FloorTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(floor_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_floor(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_floor_dispatch_vectors();
+        using impl::floor_contig_dispatch_vector;
+        using impl::floor_output_typeid_vector;
+        using impl::floor_strided_dispatch_vector;
+
+        auto floor_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, floor_output_typeid_vector,
+                floor_contig_dispatch_vector, floor_strided_dispatch_vector);
+        };
+        m.def("_floor", floor_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto floor_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              floor_output_typeid_vector);
+        };
+        m.def("_floor_result_type", floor_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
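Because every binding accepts a depends keyword (defaulting to an empty py::list()), kernels can be chained without host-side synchronization. A sketch under the same assumptions as the earlier snippet, with hypothetical arrays x, y, z on one queue:

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_impl as ti

    q = dpctl.SyclQueue()
    x = dpt.linspace(0.0, 4.0, num=256, sycl_queue=q)
    y = dpt.empty_like(x)
    z = dpt.empty_like(x)
    # floor may start as soon as expm1's computation event ev1 settles
    ht1, ev1 = ti._expm1(src=x, dst=y, sycl_queue=q)
    ht2, ev2 = ti._floor(src=y, dst=z, sycl_queue=q, depends=[ev1])
    ht2.wait()
    ht1.wait()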
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor.hpp
new file mode 100644
index 0000000000..b742b058ad
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/floor.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_floor(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp
new file mode 100644
index 0000000000..e75fc56c67
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp
@@ -0,0 +1,190 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "floor_divide.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/floor_divide.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B10: ===== FLOOR_DIVIDE (x1, x2)
+namespace impl
+{
+namespace floor_divide_fn_ns = dpctl::tensor::kernels::floor_divide;
+
+static binary_contig_impl_fn_ptr_t
+    floor_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int floor_divide_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    floor_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    floor_divide_inplace_contig_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    floor_divide_inplace_strided_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+
+void populate_floor_divide_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = floor_divide_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::FloorDivideTypeMapFactory;
+    DispatchTableBuilder<int, FloorDivideTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(floor_divide_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::FloorDivideStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         FloorDivideStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(floor_divide_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::FloorDivideContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, FloorDivideContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(floor_divide_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::FloorDivideInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         FloorDivideInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(floor_divide_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::FloorDivideInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         FloorDivideInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(floor_divide_inplace_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_floor_divide(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_floor_divide_dispatch_tables();
+        using impl::floor_divide_contig_dispatch_table;
+        using impl::floor_divide_output_id_table;
+        using impl::floor_divide_strided_dispatch_table;
+
+        auto floor_divide_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                      const arrayT &dst, sycl::queue &exec_q,
+                                      const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends,
+                floor_divide_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                floor_divide_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                floor_divide_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto floor_divide_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                  const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               floor_divide_output_id_table);
+        };
+        m.def("_floor_divide", floor_divide_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_floor_divide_result_type", floor_divide_result_type_pyapi, "");
+
+        using impl::floor_divide_inplace_contig_dispatch_table;
+        using impl::floor_divide_inplace_strided_dispatch_table;
+
+        auto floor_divide_inplace_pyapi =
+            [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
+                const event_vecT &depends = {}) {
+                return py_binary_inplace_ufunc(
+                    src, dst, exec_q, depends, floor_divide_output_id_table,
+                    // function pointers to handle inplace operation on
+                    // contiguous arrays (pointers may be nullptr)
+                    floor_divide_inplace_contig_dispatch_table,
+                    // function pointers to handle inplace operation on strided
+                    // arrays (most general case)
+                    floor_divide_inplace_strided_dispatch_table,
+                    // function pointers to handle inplace operation on
+                    // c-contig matrix with c-contig row with broadcasting
+                    // (may be nullptr)
+                    td_ns::NullPtrTable<
+                        binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+            };
+        m.def("_floor_divide_inplace", floor_divide_inplace_pyapi, "",
+              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
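floor_divide additionally wires up the in-place dispatch tables, so _floor_divide_inplace mutates its first argument. A hedged sketch, assuming the i8/i8 pair is present in the in-place type map on the build at hand (keyword names lhs/rhs come from the m.def call above):

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_impl as ti

    q = dpctl.SyclQueue()
    a = dpt.arange(10, dtype="i8", sycl_queue=q)
    b = dpt.full(10, 3, dtype="i8", sycl_queue=q)
    ht_ev, ev = ti._floor_divide_inplace(lhs=a, rhs=b, sycl_queue=q)
    ht_ev.wait()  # a now holds a // b, elementwise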
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
new file mode 100644
index 0000000000..c7f0d40dcc
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_floor_divide(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp
new file mode 100644
index 0000000000..f79102df47
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "greater.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/greater.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B11: ===== GREATER (x1, x2)
+namespace impl
+{
+namespace greater_fn_ns = dpctl::tensor::kernels::greater;
+
+static binary_contig_impl_fn_ptr_t
+    greater_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int greater_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    greater_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_greater_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = greater_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::GreaterTypeMapFactory;
+    DispatchTableBuilder<int, GreaterTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(greater_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::GreaterStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, GreaterStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(greater_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::GreaterContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, GreaterContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(greater_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_greater(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_greater_dispatch_tables();
+        using impl::greater_contig_dispatch_table;
+        using impl::greater_output_id_table;
+        using impl::greater_strided_dispatch_table;
+
+        auto greater_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, greater_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                greater_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                greater_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto greater_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               greater_output_id_table);
+        };
+        m.def("_greater", greater_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_greater_result_type", greater_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
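At runtime the DispatchTableBuilder machinery reduces to a plain num_types x num_types lookup of implementation pointers indexed by the typeids of the two inputs. A self-contained Python miniature of the same idea (illustrative only; the typeids and the helper names here are hypothetical, not dpctl's):

    # hypothetical typeids
    INT32, FLOAT32, FLOAT64 = 0, 1, 2
    num_types = 3

    # table[t1][t2] holds the kernel for that input-type pair,
    # or None where the combination is unsupported
    dispatch_table = [[None] * num_types for _ in range(num_types)]

    def greater_f4_f8(src1, src2, dst):
        """Elementwise src1 > src2 written into a boolean dst."""
        for i, (a, b) in enumerate(zip(src1, src2)):
            dst[i] = a > b

    dispatch_table[FLOAT32][FLOAT64] = greater_f4_f8

    def call(src1_typeid, src2_typeid, *args):
        fn = dispatch_table[src1_typeid][src2_typeid]
        if fn is None:
            raise TypeError("unsupported type combination")
        return fn(*args)

The factory templates (GreaterContigFactory and friends) fill this table exhaustively at module initialization, which is why populate_*_dispatch_tables runs once inside each init_* function.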
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater.hpp
new file mode 100644
index 0000000000..ba8dc57bb0
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/greater.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_greater(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
new file mode 100644
index 0000000000..005679c3fb
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
@@ -0,0 +1,141 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "greater_equal.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/greater_equal.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B12: ===== GREATER_EQUAL (x1, x2)
+namespace impl
+{
+namespace greater_equal_fn_ns = dpctl::tensor::kernels::greater_equal;
+
+static binary_contig_impl_fn_ptr_t
+    greater_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int greater_equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    greater_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_greater_equal_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = greater_equal_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::GreaterEqualTypeMapFactory;
+    DispatchTableBuilder<int, GreaterEqualTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(greater_equal_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::GreaterEqualStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         GreaterEqualStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(greater_equal_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::GreaterEqualContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
+                         GreaterEqualContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(greater_equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_greater_equal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_greater_equal_dispatch_tables();
+        using impl::greater_equal_contig_dispatch_table;
+        using impl::greater_equal_output_id_table;
+        using impl::greater_equal_strided_dispatch_table;
+
+        auto greater_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                       const arrayT &dst, sycl::queue &exec_q,
+                                       const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, greater_equal_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                greater_equal_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                greater_equal_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto greater_equal_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                   const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               greater_equal_output_id_table);
+        };
+        m.def("_greater_equal", greater_equal_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_greater_equal_result_type", greater_equal_result_type_pyapi,
+              "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
new file mode 100644
index 0000000000..2cf116566e
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_greater_equal(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp
new file mode 100644
index 0000000000..2442710198
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "hypot.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/hypot.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B24: ===== HYPOT (x1, x2)
+namespace impl
+{
+namespace hypot_fn_ns = dpctl::tensor::kernels::hypot;
+
+static binary_contig_impl_fn_ptr_t
+    hypot_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int hypot_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    hypot_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_hypot_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = hypot_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::HypotTypeMapFactory;
+    DispatchTableBuilder<int, HypotTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(hypot_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::HypotStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, HypotStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(hypot_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::HypotContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, HypotContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(hypot_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_hypot(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_hypot_dispatch_tables();
+        using impl::hypot_contig_dispatch_table;
+        using impl::hypot_output_id_table;
+        using impl::hypot_strided_dispatch_table;
+
+        auto hypot_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                               const arrayT &dst, sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, hypot_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                hypot_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                hypot_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto hypot_result_type_pyapi = [&](const py::dtype &dtype1,
+                                           const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               hypot_output_id_table);
+        };
+        m.def("_hypot", hypot_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_hypot_result_type", hypot_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
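The _<op>_result_type helpers expose the type-map table to Python; this is what lets callers size the destination before dispatching. A hedged sketch, assuming the usual IEEE promotion rules encoded by HypotTypeMapFactory on a typical build:

    import numpy as np
    import dpctl.tensor._tensor_impl as ti

    # consults the table built from HypotTypeMapFactory;
    # mixing f4 and f8 is expected to promote to f8
    out_dt = ti._hypot_result_type(np.dtype("f4"), np.dtype("f8"))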
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.hpp
new file mode 100644
index 0000000000..2d154917ea
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_hypot(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp
new file mode 100644
index 0000000000..4012b9206f
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "imag.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/imag.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U16: ==== IMAG (x)
+namespace impl
+{
+
+namespace imag_fn_ns = dpctl::tensor::kernels::imag;
+
+static unary_contig_impl_fn_ptr_t imag_contig_dispatch_vector[td_ns::num_types];
+static int imag_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    imag_strided_dispatch_vector[td_ns::num_types];
+
+void populate_imag_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = imag_fn_ns;
+
+    using fn_ns::ImagContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ImagContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(imag_contig_dispatch_vector);
+
+    using fn_ns::ImagStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ImagStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(imag_strided_dispatch_vector);
+
+    using fn_ns::ImagTypeMapFactory;
+    DispatchVectorBuilder<int, ImagTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(imag_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_imag(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_imag_dispatch_vectors();
+        using impl::imag_contig_dispatch_vector;
+        using impl::imag_output_typeid_vector;
+        using impl::imag_strided_dispatch_vector;
+
+        auto imag_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, imag_output_typeid_vector,
+                imag_contig_dispatch_vector, imag_strided_dispatch_vector);
+        };
+        m.def("_imag", imag_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto imag_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, imag_output_typeid_vector);
+        };
+        m.def("_imag_result_type", imag_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/imag.hpp
new file mode 100644
index 0000000000..ffac3f2465
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/imag.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_imag(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp
new file mode 100644
index 0000000000..73a2be4010
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp
@@ -0,0 +1,122 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "isfinite.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/isfinite.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U17: ==== ISFINITE (x)
+namespace impl
+{
+
+namespace isfinite_fn_ns = dpctl::tensor::kernels::isfinite;
+
+static unary_contig_impl_fn_ptr_t
+    isfinite_contig_dispatch_vector[td_ns::num_types];
+static int isfinite_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    isfinite_strided_dispatch_vector[td_ns::num_types];
+
+void populate_isfinite_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = isfinite_fn_ns;
+
+    using fn_ns::IsFiniteContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsFiniteContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(isfinite_contig_dispatch_vector);
+
+    using fn_ns::IsFiniteStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsFiniteStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(isfinite_strided_dispatch_vector);
+
+    using fn_ns::IsFiniteTypeMapFactory;
+    DispatchVectorBuilder<int, IsFiniteTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(isfinite_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_isfinite(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_isfinite_dispatch_vectors();
+        using impl::isfinite_contig_dispatch_vector;
+        using impl::isfinite_output_typeid_vector;
+        using impl::isfinite_strided_dispatch_vector;
+
+        auto isfinite_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  isfinite_output_typeid_vector,
+                                  isfinite_contig_dispatch_vector,
+                                  isfinite_strided_dispatch_vector);
+        };
+        m.def("_isfinite", isfinite_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto isfinite_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              isfinite_output_typeid_vector);
+        };
+        m.def("_isfinite_result_type", isfinite_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
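Unary ops use the one-dimensional analogue of the table above: a dispatch vector indexed by the single input typeid, plus a parallel int vector naming the output typeid (bool for the is* predicates). A Python miniature, again purely illustrative with hypothetical typeids:

    INT32, FLOAT32, FLOAT64 = 0, 1, 2
    BOOL = 3  # hypothetical typeid of bool
    num_types = 4

    contig_vector = [None] * num_types        # per-input-type kernels
    output_typeid_vector = [-1] * num_types   # output typeid per input type

    def isfinite_f4(src, dst):
        """Write a boolean per element of a float32 src."""
        for i, v in enumerate(src):
            dst[i] = v == v and abs(v) != float("inf")

    contig_vector[FLOAT32] = isfinite_f4
    output_typeid_vector[FLOAT32] = BOOL  # predicates always map to bool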
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.hpp
new file mode 100644
index 0000000000..fd7508792b
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_isfinite(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp
new file mode 100644
index 0000000000..2600fe4f74
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "isinf.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/isinf.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U18: ==== ISINF (x)
+namespace impl
+{
+
+namespace isinf_fn_ns = dpctl::tensor::kernels::isinf;
+
+static unary_contig_impl_fn_ptr_t
+    isinf_contig_dispatch_vector[td_ns::num_types];
+static int isinf_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    isinf_strided_dispatch_vector[td_ns::num_types];
+
+void populate_isinf_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = isinf_fn_ns;
+
+    using fn_ns::IsInfContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsInfContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(isinf_contig_dispatch_vector);
+
+    using fn_ns::IsInfStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsInfStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(isinf_strided_dispatch_vector);
+
+    using fn_ns::IsInfTypeMapFactory;
+    DispatchVectorBuilder<int, IsInfTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(isinf_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_isinf(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_isinf_dispatch_vectors();
+        using impl::isinf_contig_dispatch_vector;
+        using impl::isinf_output_typeid_vector;
+        using impl::isinf_strided_dispatch_vector;
+
+        auto isinf_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, isinf_output_typeid_vector,
+                isinf_contig_dispatch_vector, isinf_strided_dispatch_vector);
+        };
+        m.def("_isinf", isinf_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto isinf_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              isinf_output_typeid_vector);
+        };
+        m.def("_isinf_result_type", isinf_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.hpp
new file mode 100644
index 0000000000..8c3cd51c91
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_isinf(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp
new file mode 100644
index 0000000000..b75618c5e0
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "isnan.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/isnan.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U19: ==== ISNAN (x)
+namespace impl
+{
+
+namespace isnan_fn_ns = dpctl::tensor::kernels::isnan;
+
+static unary_contig_impl_fn_ptr_t
+    isnan_contig_dispatch_vector[td_ns::num_types];
+static int isnan_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    isnan_strided_dispatch_vector[td_ns::num_types];
+
+void populate_isnan_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = isnan_fn_ns;
+
+    using fn_ns::IsNanContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsNanContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(isnan_contig_dispatch_vector);
+
+    using fn_ns::IsNanStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsNanStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(isnan_strided_dispatch_vector);
+
+    using fn_ns::IsNanTypeMapFactory;
+    DispatchVectorBuilder<int, IsNanTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(isnan_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_isnan(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_isnan_dispatch_vectors();
+        using impl::isnan_contig_dispatch_vector;
+        using impl::isnan_output_typeid_vector;
+        using impl::isnan_strided_dispatch_vector;
+
+        auto isnan_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, isnan_output_typeid_vector,
+                isnan_contig_dispatch_vector, isnan_strided_dispatch_vector);
+        };
+        m.def("_isnan", isnan_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto isnan_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              isnan_output_typeid_vector);
+        };
+        m.def("_isnan_result_type", isnan_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.hpp
new file mode 100644
index 0000000000..df1f41d47f
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_isnan(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp
new file mode 100644
index 0000000000..c34122d862
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "less.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/less.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B13: ===== LESS (x1, x2)
+namespace impl
+{
+namespace less_fn_ns = dpctl::tensor::kernels::less;
+
+static binary_contig_impl_fn_ptr_t less_contig_dispatch_table[td_ns::num_types]
+                                                             [td_ns::num_types];
+static int less_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    less_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_less_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = less_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LessTypeMapFactory;
+    DispatchTableBuilder<int, LessTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(less_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LessStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LessStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(less_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LessContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LessContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(less_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_less(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_less_dispatch_tables();
+        using impl::less_contig_dispatch_table;
+        using impl::less_output_id_table;
+        using impl::less_strided_dispatch_table;
+
+        auto less_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                              const arrayT &dst, sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, less_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                less_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                less_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto less_result_type_pyapi = [&](const py::dtype &dtype1,
+                                          const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               less_output_id_table);
+        };
+        m.def("_less", less_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_less_result_type", less_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
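Comparisons allocate a boolean destination; the result-type query is expected to return bool for every supported input pair. A sketch under the same assumptions as the earlier snippets:

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_impl as ti

    q = dpctl.SyclQueue()
    x = dpt.arange(8, dtype="f4", sycl_queue=q)
    y = dpt.full(8, 4.0, dtype="f4", sycl_queue=q)
    mask = dpt.empty(8, dtype=ti._less_result_type(x.dtype, y.dtype),
                     sycl_queue=q)
    ht_ev, ev = ti._less(src1=x, src2=y, dst=mask, sycl_queue=q)
    ht_ev.wait()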
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/less.hpp
new file mode 100644
index 0000000000..dada4b4be7
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/less.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_less(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
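[Editorial note, not part of the patch: each header exports exactly one init_* hook. The registration translation unit that consumes these hooks is not shown in this hunk, so the aggregator below is a hypothetical sketch; its file placement and function name are assumptions.]

// --- editorial sketch, not part of the patch ---
#include <pybind11/pybind11.h>

#include "less.hpp"
#include "less_equal.hpp"
#include "log.hpp"
// ...one include per elementwise function...

namespace py = pybind11;

void init_elementwise_functions(py::module_ m)
{
    using namespace dpctl::tensor::py_internal;
    init_less(m);       // registers _less and _less_result_type
    init_less_equal(m); // registers _less_equal and _less_equal_result_type
    init_log(m);        // registers _log and _log_result_type
    // ...
}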
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp
new file mode 100644
index 0000000000..712b30d902
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "less_equal.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/less_equal.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B14: ===== LESS_EQUAL (x1, x2)
+namespace impl
+{
+namespace less_equal_fn_ns = dpctl::tensor::kernels::less_equal;
+
+static binary_contig_impl_fn_ptr_t
+    less_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int less_equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    less_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_less_equal_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = less_equal_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LessEqualTypeMapFactory;
+    DispatchTableBuilder<int, LessEqualTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(less_equal_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LessEqualStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LessEqualStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(less_equal_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LessEqualContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LessEqualContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(less_equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_less_equal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_less_equal_dispatch_tables();
+        using impl::less_equal_contig_dispatch_table;
+        using impl::less_equal_output_id_table;
+        using impl::less_equal_strided_dispatch_table;
+
+        auto less_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                    const arrayT &dst, sycl::queue &exec_q,
+                                    const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, less_equal_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                less_equal_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                less_equal_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto less_equal_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               less_equal_output_id_table);
+        };
+        m.def("_less_equal", less_equal_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_less_equal_result_type", less_equal_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.hpp
new file mode 100644
index 0000000000..e52ee3b940
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_less_equal(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp
new file mode 100644
index 0000000000..f73b9e2414
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "log.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/log.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U20: ==== LOG (x)
+namespace impl
+{
+
+namespace log_fn_ns = dpctl::tensor::kernels::log;
+
+static unary_contig_impl_fn_ptr_t log_contig_dispatch_vector[td_ns::num_types];
+static int log_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    log_strided_dispatch_vector[td_ns::num_types];
+
+void populate_log_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = log_fn_ns;
+
+    using fn_ns::LogContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, LogContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(log_contig_dispatch_vector);
+
+    using fn_ns::LogStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, LogStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(log_strided_dispatch_vector);
+
+    using fn_ns::LogTypeMapFactory;
+    DispatchVectorBuilder<int, LogTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(log_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_log(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_log_dispatch_vectors();
+        using impl::log_contig_dispatch_vector;
+        using impl::log_output_typeid_vector;
+        using impl::log_strided_dispatch_vector;
+
+        auto log_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, log_output_typeid_vector,
+                log_contig_dispatch_vector, log_strided_dispatch_vector);
+        };
+        m.def("_log", log_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto log_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, log_output_typeid_vector);
+        };
+        m.def("_log_result_type", log_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
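[Editorial note, not part of the patch: unary functions are keyed by a single input type id, so the dispatch data populated above are 1-D vectors rather than the num_types x num_types tables used for binary functions. A minimal sketch of the consultation py_unary_ufunc implies; the helper name is invented for illustration.]

// --- editorial sketch, not part of the patch ---
int check_log_type_support(int src_typeid)
{
    // -1 marks an unsupported input type; any other value is the dpctl
    // type id the destination array must have.
    int dst_typeid = impl::log_output_typeid_vector[src_typeid];
    // When supported, py_unary_ufunc launches one of:
    //   impl::log_contig_dispatch_vector[src_typeid]   (contiguous data), or
    //   impl::log_strided_dispatch_vector[src_typeid]  (general case).
    return dst_typeid;
}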
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log.hpp
new file mode 100644
index 0000000000..1ca152d174
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_log(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp
new file mode 100644
index 0000000000..566dfcbcf7
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "log10.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/log10.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U23: ==== LOG10 (x)
+namespace impl
+{
+
+namespace log10_fn_ns = dpctl::tensor::kernels::log10;
+
+static unary_contig_impl_fn_ptr_t
+    log10_contig_dispatch_vector[td_ns::num_types];
+static int log10_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    log10_strided_dispatch_vector[td_ns::num_types];
+
+void populate_log10_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = log10_fn_ns;
+
+    using fn_ns::Log10ContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Log10ContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(log10_contig_dispatch_vector);
+
+    using fn_ns::Log10StridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Log10StridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(log10_strided_dispatch_vector);
+
+    using fn_ns::Log10TypeMapFactory;
+    DispatchVectorBuilder<int, Log10TypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(log10_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_log10(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_log10_dispatch_vectors();
+        using impl::log10_contig_dispatch_vector;
+        using impl::log10_output_typeid_vector;
+        using impl::log10_strided_dispatch_vector;
+
+        auto log10_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, log10_output_typeid_vector,
+                log10_contig_dispatch_vector, log10_strided_dispatch_vector);
+        };
+        m.def("_log10", log10_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto log10_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              log10_output_typeid_vector);
+        };
+        m.def("_log10_result_type", log10_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log10.hpp
new file mode 100644
index 0000000000..3972695849
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log10.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_log10(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp
new file mode 100644
index 0000000000..badb474778
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "log1p.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/log1p.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U21: ==== LOG1P (x)
+namespace impl
+{
+
+namespace log1p_fn_ns = dpctl::tensor::kernels::log1p;
+
+static unary_contig_impl_fn_ptr_t
+    log1p_contig_dispatch_vector[td_ns::num_types];
+static int log1p_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    log1p_strided_dispatch_vector[td_ns::num_types];
+
+void populate_log1p_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = log1p_fn_ns;
+
+    using fn_ns::Log1pContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Log1pContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(log1p_contig_dispatch_vector);
+
+    using fn_ns::Log1pStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Log1pStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(log1p_strided_dispatch_vector);
+
+    using fn_ns::Log1pTypeMapFactory;
+    DispatchVectorBuilder<int, Log1pTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(log1p_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_log1p(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_log1p_dispatch_vectors();
+        using impl::log1p_contig_dispatch_vector;
+        using impl::log1p_output_typeid_vector;
+        using impl::log1p_strided_dispatch_vector;
+
+        auto log1p_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, log1p_output_typeid_vector,
+                log1p_contig_dispatch_vector, log1p_strided_dispatch_vector);
+        };
+        m.def("_log1p", log1p_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto log1p_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              log1p_output_typeid_vector);
+        };
+        m.def("_log1p_result_type", log1p_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
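[Editorial note, not part of the patch: a dedicated log1p kernel exists because log(1 + x) loses all of the information carried by x once 1 + x rounds to 1, while log1p(x) stays accurate for |x| much smaller than 1. A host-side demonstration of the effect:]

// --- editorial sketch, not part of the patch ---
#include <cmath>
#include <cstdio>

int main(void)
{
    double x = 1e-18;
    // 1.0 + 1e-18 rounds to 1.0 in double precision, so log gives 0.
    std::printf("log(1+x) = %.17g\n", std::log(1.0 + x)); // prints 0
    // log1p avoids forming 1 + x and keeps full precision (~1e-18).
    std::printf("log1p(x) = %.17g\n", std::log1p(x));
    return 0;
}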
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.hpp
new file mode 100644
index 0000000000..438b93601c
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_log1p(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp
new file mode 100644
index 0000000000..b5a8a39684
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "log2.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/log2.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U22: ==== LOG2 (x)
+namespace impl
+{
+
+namespace log2_fn_ns = dpctl::tensor::kernels::log2;
+
+static unary_contig_impl_fn_ptr_t log2_contig_dispatch_vector[td_ns::num_types];
+static int log2_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    log2_strided_dispatch_vector[td_ns::num_types];
+
+void populate_log2_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = log2_fn_ns;
+
+    using fn_ns::Log2ContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Log2ContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(log2_contig_dispatch_vector);
+
+    using fn_ns::Log2StridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Log2StridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(log2_strided_dispatch_vector);
+
+    using fn_ns::Log2TypeMapFactory;
+    DispatchVectorBuilder<int, Log2TypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(log2_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_log2(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_log2_dispatch_vectors();
+        using impl::log2_contig_dispatch_vector;
+        using impl::log2_output_typeid_vector;
+        using impl::log2_strided_dispatch_vector;
+
+        auto log2_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, log2_output_typeid_vector,
+                log2_contig_dispatch_vector, log2_strided_dispatch_vector);
+        };
+        m.def("_log2", log2_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto log2_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, log2_output_typeid_vector);
+        };
+        m.def("_log2_result_type", log2_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log2.hpp
new file mode 100644
index 0000000000..4e47ed369a
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log2.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_log2(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp
new file mode 100644
index 0000000000..77ded230be
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "logaddexp.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logaddexp.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B15: ===== LOGADDEXP (x1, x2)
+namespace impl
+{
+namespace logaddexp_fn_ns = dpctl::tensor::kernels::logaddexp;
+
+static binary_contig_impl_fn_ptr_t
+    logaddexp_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logaddexp_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logaddexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logaddexp_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logaddexp_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogAddExpTypeMapFactory;
+    DispatchTableBuilder<int, LogAddExpTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logaddexp_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogAddExpStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogAddExpStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logaddexp_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogAddExpContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogAddExpContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logaddexp_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logaddexp(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logaddexp_dispatch_tables();
+        using impl::logaddexp_contig_dispatch_table;
+        using impl::logaddexp_output_id_table;
+        using impl::logaddexp_strided_dispatch_table;
+
+        auto logaddexp_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logaddexp_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logaddexp_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                logaddexp_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logaddexp_result_type_pyapi = [&](const py::dtype &dtype1,
+                                               const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logaddexp_output_id_table);
+        };
+        m.def("_logaddexp", logaddexp_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logaddexp_result_type", logaddexp_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
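[Editorial note, not part of the patch: the kernel header kernels/elementwise_functions/logaddexp.hpp is not shown in this diff. The standard numerically stable formulation of logaddexp(x1, x2) = log(exp(x1) + exp(x2)), given here as a host-side reference, factors out the maximum so the exponential never overflows:]

// --- editorial sketch, not part of the patch ---
#include <algorithm>
#include <cmath>

double logaddexp_ref(double x1, double x2)
{
    // Naive log(exp(x1) + exp(x2)) overflows for moderate inputs, e.g.
    // exp(710) is already inf in double precision.
    if (x1 == x2) {
        // Also covers x1 == x2 == -inf, where the branch below would
        // otherwise compute -inf - (-inf) = NaN.
        return x1 + std::log(2.0);
    }
    const double mx = std::max(x1, x2);
    const double mn = std::min(x1, x2);
    // mn - mx <= 0, so exp() stays in [0, 1] and log1p keeps precision.
    return mx + std::log1p(std::exp(mn - mx));
}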
m.def("_logaddexp_result_type", logaddexp_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.hpp new file mode 100644 index 0000000000..6601b3f9c5 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_logaddexp(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp new file mode 100644 index 0000000000..4c573ce508 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp
new file mode 100644
index 0000000000..4c573ce508
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "logical_and.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_and.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B16: ===== LOGICAL_AND (x1, x2)
+namespace impl
+{
+namespace logical_and_fn_ns = dpctl::tensor::kernels::logical_and;
+
+static binary_contig_impl_fn_ptr_t
+    logical_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_and_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_and_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_and_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalAndTypeMapFactory;
+    DispatchTableBuilder<int, LogicalAndTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_and_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalAndStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalAndStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_and_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalAndContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalAndContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_and_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_and(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_and_dispatch_tables();
+        using impl::logical_and_contig_dispatch_table;
+        using impl::logical_and_output_id_table;
+        using impl::logical_and_strided_dispatch_table;
+
+        auto logical_and_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_and_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_and_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                logical_and_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_and_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_and_output_id_table);
+        };
+        m.def("_logical_and", logical_and_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_and_result_type", logical_and_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
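[Editorial note, not part of the patch: every comparison and logical binary function in this series passes td_ns::NullPtrTable arguments for the two matrix-row broadcast slots, i.e. it opts out of those specializations and lets the general strided path handle broadcasting. The actual NullPtrTable definition lives in utils/type_dispatch.hpp and is not shown in the diff; the following is only a sketch of the idea behind such a helper.]

// --- editorial sketch, not part of the patch ---
// A "table of nullptrs": zero-initialization makes every (i, j) entry a null
// function pointer, which the dispatching code interprets as "no specialized
// kernel available for this type pair -- fall back to the strided kernel".
template <typename FnPtrT, int N> struct null_ptr_table_sketch
{
    FnPtrT table[N][N] = {};
};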
py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_logical_and_result_type", logical_and_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.hpp new file mode 100644 index 0000000000..ee73f7c8d5 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_logical_and(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp new file mode 100644 index 0000000000..84362cd9ce --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp @@ -0,0 +1,123 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp
new file mode 100644
index 0000000000..84362cd9ce
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp
@@ -0,0 +1,123 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "logical_not.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_not.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U24: ==== LOGICAL_NOT (x)
+namespace impl
+{
+
+namespace logical_not_fn_ns = dpctl::tensor::kernels::logical_not;
+
+static unary_contig_impl_fn_ptr_t
+    logical_not_contig_dispatch_vector[td_ns::num_types];
+static int logical_not_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    logical_not_strided_dispatch_vector[td_ns::num_types];
+
+void populate_logical_not_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_not_fn_ns;
+
+    using fn_ns::LogicalNotContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, LogicalNotContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(logical_not_contig_dispatch_vector);
+
+    using fn_ns::LogicalNotStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, LogicalNotStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(logical_not_strided_dispatch_vector);
+
+    using fn_ns::LogicalNotTypeMapFactory;
+    DispatchVectorBuilder<int, LogicalNotTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(logical_not_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_logical_not(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_not_dispatch_vectors();
+        using impl::logical_not_contig_dispatch_vector;
+        using impl::logical_not_output_typeid_vector;
+        using impl::logical_not_strided_dispatch_vector;
+
+        auto logical_not_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                     sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  logical_not_output_typeid_vector,
+                                  logical_not_contig_dispatch_vector,
+                                  logical_not_strided_dispatch_vector);
+        };
+        m.def("_logical_not", logical_not_pyapi, "", py::arg("src"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        auto logical_not_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              logical_not_output_typeid_vector);
+        };
+        m.def("_logical_not_result_type", logical_not_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.hpp
new file mode 100644
index 0000000000..c1a2c393aa
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_logical_not(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp
new file mode 100644
index 0000000000..ebf8251b2e
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "logical_or.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_or.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B17: ===== LOGICAL_OR (x1, x2)
+namespace impl
+{
+namespace logical_or_fn_ns = dpctl::tensor::kernels::logical_or;
+
+static binary_contig_impl_fn_ptr_t
+    logical_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_or_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_or_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_or_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalOrTypeMapFactory;
+    DispatchTableBuilder<int, LogicalOrTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_or_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalOrStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalOrStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_or_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalOrContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalOrContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_or_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_or(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_or_dispatch_tables();
+        using impl::logical_or_contig_dispatch_table;
+        using impl::logical_or_output_id_table;
+        using impl::logical_or_strided_dispatch_table;
+
+        auto logical_or_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                    const arrayT &dst, sycl::queue &exec_q,
+                                    const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_or_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_or_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                logical_or_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_or_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_or_output_id_table);
+        };
+        m.def("_logical_or", logical_or_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_or_result_type", logical_or_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
py::arg("depends") = py::list()); + m.def("_logical_or_result_type", logical_or_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.hpp new file mode 100644 index 0000000000..00a4ddfcc2 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_logical_xor(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp new file mode 100644 index 0000000000..9488a5615a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
new file mode 100644
index 0000000000..9488a5615a
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "logical_xor.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_xor.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B18: ===== LOGICAL_XOR (x1, x2)
+namespace impl
+{
+namespace logical_xor_fn_ns = dpctl::tensor::kernels::logical_xor;
+
+static binary_contig_impl_fn_ptr_t
+    logical_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_xor_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_xor_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_xor_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalXorTypeMapFactory;
+    DispatchTableBuilder<int, LogicalXorTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_xor_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalXorStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalXorStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_xor_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalXorContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalXorContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_xor_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_xor(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_xor_dispatch_tables();
+        using impl::logical_xor_contig_dispatch_table;
+        using impl::logical_xor_output_id_table;
+        using impl::logical_xor_strided_dispatch_table;
+
+        auto logical_xor_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_xor_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_xor_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                logical_xor_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_xor_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_xor_output_id_table);
+        };
+        m.def("_logical_xor", logical_xor_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_xor_result_type", logical_xor_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
new file mode 100644
index 0000000000..ad069eb120
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_logical_xor(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
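A fuller sketch of the binary-ufunc calling convention that `_logical_xor` (and every other binary binding in these hunks) follows: the caller allocates `dst` using the result-type helper and passes the queue explicitly. This assumes a SYCL device is available; the returned pair being (host-task event, computation event) is an assumption based on `py_binary_ufunc`'s usual convention, not something these hunks show:

```python
import dpctl
import dpctl.tensor as dpt
from dpctl.tensor import _tensor_impl as ti

q = dpctl.SyclQueue()
x1 = dpt.asarray([True, False, True, False], sycl_queue=q)
x2 = dpt.asarray([True, True, False, False], sycl_queue=q)

# result dtype comes from the output_id table populated above
res_dt = ti._logical_xor_result_type(x1.dtype, x2.dtype)
dst = dpt.empty(x1.shape, dtype=res_dt, sycl_queue=q)

ht_ev, _ = ti._logical_xor(src1=x1, src2=x2, dst=dst, sycl_queue=q, depends=[])
ht_ev.wait()
print(dpt.asnumpy(dst))  # expected: [False  True  True  False]
```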
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp
new file mode 100644
index 0000000000..208bdcf47f
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "maximum.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/maximum.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B26: ===== MAXIMUM (x1, x2)
+namespace impl
+{
+namespace maximum_fn_ns = dpctl::tensor::kernels::maximum;
+
+static binary_contig_impl_fn_ptr_t
+    maximum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int maximum_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    maximum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_maximum_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = maximum_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MaximumTypeMapFactory;
+    DispatchTableBuilder<int, MaximumTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(maximum_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MaximumStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MaximumStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(maximum_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MaximumContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MaximumContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(maximum_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_maximum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_maximum_dispatch_tables();
+        using impl::maximum_contig_dispatch_table;
+        using impl::maximum_output_id_table;
+        using impl::maximum_strided_dispatch_table;
+
+        auto maximum_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, maximum_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                maximum_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                maximum_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto maximum_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               maximum_output_id_table);
+        };
+        m.def("_maximum", maximum_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_maximum_result_type", maximum_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.hpp new file mode 100644 index 0000000000..0f49850567 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_maximum(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp new file mode 100644 index 0000000000..dc1a826ac4 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp
new file mode 100644
index 0000000000..dc1a826ac4
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "minimum.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/minimum.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B27: ===== MINIMUM (x1, x2)
+namespace impl
+{
+namespace minimum_fn_ns = dpctl::tensor::kernels::minimum;
+
+static binary_contig_impl_fn_ptr_t
+    minimum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int minimum_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    minimum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_minimum_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = minimum_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MinimumTypeMapFactory;
+    DispatchTableBuilder<int, MinimumTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(minimum_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MinimumStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MinimumStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(minimum_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MinimumContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MinimumContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(minimum_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_minimum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_minimum_dispatch_tables();
+        using impl::minimum_contig_dispatch_table;
+        using impl::minimum_output_id_table;
+        using impl::minimum_strided_dispatch_table;
+
+        auto minimum_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, minimum_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                minimum_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                minimum_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto minimum_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               minimum_output_id_table);
+        };
+        m.def("_minimum", minimum_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_minimum_result_type", minimum_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.hpp new file mode 100644 index 0000000000..f1f2467c1e --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_minimum(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp new file mode 100644 index 0000000000..c087abd9ff --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp @@ -0,0 +1,230 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp
new file mode 100644
index 0000000000..c087abd9ff
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp
@@ -0,0 +1,230 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "multiply.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/multiply.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B19: ===== MULTIPLY (x1, x2)
+namespace impl
+{
+
+namespace multiply_fn_ns = dpctl::tensor::kernels::multiply;
+
+static binary_contig_impl_fn_ptr_t
+    multiply_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int multiply_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    multiply_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// mul(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    multiply_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// mul(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    multiply_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    multiply_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    multiply_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    multiply_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_multiply_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = multiply_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MultiplyTypeMapFactory;
+    DispatchTableBuilder<int, MultiplyTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(multiply_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MultiplyStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MultiplyStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(multiply_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MultiplyContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MultiplyContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(multiply_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::MultiplyContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        MultiplyContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        multiply_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::MultiplyContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        MultiplyContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        multiply_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::MultiplyInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         MultiplyInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(multiply_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::MultiplyInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         MultiplyInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(multiply_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::MultiplyInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         MultiplyInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(multiply_inplace_row_matrix_dispatch_table);
+};
+
+} // namespace impl
+
+void init_multiply(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_multiply_dispatch_tables();
+        using impl::multiply_contig_dispatch_table;
+        using impl::multiply_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::multiply_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::multiply_output_id_table;
+        using impl::multiply_strided_dispatch_table;
+
+        auto multiply_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                  const arrayT &dst, sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, multiply_output_id_table,
+                // function pointers to handle operation on contiguous
+                // arrays (pointers may be nullptr)
+                multiply_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                multiply_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                multiply_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                multiply_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto multiply_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               multiply_output_id_table);
+        };
+        m.def("_multiply", multiply_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_multiply_result_type", multiply_result_type_pyapi, "");
+
+        using impl::multiply_inplace_contig_dispatch_table;
+        using impl::multiply_inplace_row_matrix_dispatch_table;
+        using impl::multiply_inplace_strided_dispatch_table;
+
+        auto multiply_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                          sycl::queue &exec_q,
+                                          const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, multiply_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                multiply_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                multiply_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                multiply_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_multiply_inplace", multiply_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.hpp
new file mode 100644
index 0000000000..e110ecbb20
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_multiply(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
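Unlike the out-of-place bindings, `_multiply_inplace` writes into its `lhs` argument, so the caller allocates nothing. A minimal sketch under the same assumptions as the earlier examples (here `lhs` and `rhs` share a dtype, so no promotion question arises):

```python
import dpctl
import dpctl.tensor as dpt
from dpctl.tensor import _tensor_impl as ti

q = dpctl.SyclQueue()
lhs = dpt.asarray([1.0, 2.0, 3.0], sycl_queue=q)
rhs = dpt.asarray([10.0, 10.0, 10.0], sycl_queue=q)

# lhs is both an input and the destination
ht, _ = ti._multiply_inplace(lhs=lhs, rhs=rhs, sycl_queue=q)
ht.wait()
print(dpt.asnumpy(lhs))  # expected: [10. 20. 30.]
```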
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp
new file mode 100644
index 0000000000..bc659506d1
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp
@@ -0,0 +1,122 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "negative.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/negative.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U25: ==== NEGATIVE (x)
+namespace impl
+{
+
+namespace negative_fn_ns = dpctl::tensor::kernels::negative;
+
+static unary_contig_impl_fn_ptr_t
+    negative_contig_dispatch_vector[td_ns::num_types];
+static int negative_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    negative_strided_dispatch_vector[td_ns::num_types];
+
+void populate_negative_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = negative_fn_ns;
+
+    using fn_ns::NegativeContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, NegativeContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(negative_contig_dispatch_vector);
+
+    using fn_ns::NegativeStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, NegativeStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(negative_strided_dispatch_vector);
+
+    using fn_ns::NegativeTypeMapFactory;
+    DispatchVectorBuilder<int, NegativeTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(negative_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_negative(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_negative_dispatch_vectors();
+        using impl::negative_contig_dispatch_vector;
+        using impl::negative_output_typeid_vector;
+        using impl::negative_strided_dispatch_vector;
+
+        auto negative_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  negative_output_typeid_vector,
+                                  negative_contig_dispatch_vector,
+                                  negative_strided_dispatch_vector);
+        };
+        m.def("_negative", negative_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto negative_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              negative_output_typeid_vector);
+        };
+        m.def("_negative_result_type", negative_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/negative.hpp
new file mode 100644
index 0000000000..048e481b34
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/negative.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_negative(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
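The unary bindings use the same three-vector dispatch (type map, contig, strided) seen in `populate_negative_dispatch_vectors` above, but take a single `src`. A minimal sketch under the same assumptions:

```python
import dpctl
import dpctl.tensor as dpt
from dpctl.tensor import _tensor_impl as ti

q = dpctl.SyclQueue()
x = dpt.asarray([1, -2, 3], dtype="i4", sycl_queue=q)
dst = dpt.empty(x.shape, dtype=ti._negative_result_type(x.dtype), sycl_queue=q)

ht, _ = ti._negative(src=x, dst=dst, sycl_queue=q)
ht.wait()
print(dpt.asnumpy(dst))  # expected: [-1  2 -3]
```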
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp
new file mode 100644
index 0000000000..a7a3e909cb
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "not_equal.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/not_equal.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B20: ===== NOT_EQUAL (x1, x2)
+namespace impl
+{
+namespace not_equal_fn_ns = dpctl::tensor::kernels::not_equal;
+
+static binary_contig_impl_fn_ptr_t
+    not_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int not_equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    not_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_not_equal_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = not_equal_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::NotEqualTypeMapFactory;
+    DispatchTableBuilder<int, NotEqualTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(not_equal_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::NotEqualStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, NotEqualStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(not_equal_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::NotEqualContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, NotEqualContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(not_equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_not_equal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_not_equal_dispatch_tables();
+        using impl::not_equal_contig_dispatch_table;
+        using impl::not_equal_output_id_table;
+        using impl::not_equal_strided_dispatch_table;
+
+        auto not_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, not_equal_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                not_equal_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                not_equal_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto not_equal_result_type_pyapi = [&](const py::dtype &dtype1,
+                                               const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               not_equal_output_id_table);
+        };
+        m.def("_not_equal", not_equal_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_not_equal_result_type", not_equal_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.hpp
new file mode 100644
index 0000000000..4e1f654e79
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_not_equal(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
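The `depends` parameter every binding accepts lets one call be ordered after another without a host synchronization. A sketch chaining `_negative` into `_not_equal`; that the second element of the returned tuple is the computation event suitable for `depends` is an assumption about `py_binary_ufunc`/`py_unary_ufunc`, not something these hunks state:

```python
import dpctl
import dpctl.tensor as dpt
from dpctl.tensor import _tensor_impl as ti

q = dpctl.SyclQueue()
a = dpt.asarray([0, 1, 2], dtype="i4", sycl_queue=q)
b = dpt.asarray([0, -1, 2], dtype="i4", sycl_queue=q)
neg_b = dpt.empty(b.shape, dtype=b.dtype, sycl_queue=q)

ht1, ev1 = ti._negative(src=b, dst=neg_b, sycl_queue=q)

flags = dpt.empty(a.shape, dtype=ti._not_equal_result_type(a.dtype, neg_b.dtype),
                  sycl_queue=q)
# schedule the comparison after the negation that produces neg_b
ht2, _ = ti._not_equal(src1=a, src2=neg_b, dst=flags, sycl_queue=q, depends=[ev1])
ht2.wait()
ht1.wait()
print(dpt.asnumpy(flags))  # expected: [False  False  False] -> a == -b elementwise
```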
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp
new file mode 100644
index 0000000000..eaff0794d2
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp
@@ -0,0 +1,122 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "positive.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/positive.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U26: ==== POSITIVE (x)
+namespace impl
+{
+
+namespace positive_fn_ns = dpctl::tensor::kernels::positive;
+
+static unary_contig_impl_fn_ptr_t
+    positive_contig_dispatch_vector[td_ns::num_types];
+static int positive_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    positive_strided_dispatch_vector[td_ns::num_types];
+
+void populate_positive_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = positive_fn_ns;
+
+    using fn_ns::PositiveContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, PositiveContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(positive_contig_dispatch_vector);
+
+    using fn_ns::PositiveStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, PositiveStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(positive_strided_dispatch_vector);
+
+    using fn_ns::PositiveTypeMapFactory;
+    DispatchVectorBuilder<int, PositiveTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(positive_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_positive(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_positive_dispatch_vectors();
+        using impl::positive_contig_dispatch_vector;
+        using impl::positive_output_typeid_vector;
+        using impl::positive_strided_dispatch_vector;
+
+        auto positive_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  positive_output_typeid_vector,
+                                  positive_contig_dispatch_vector,
+                                  positive_strided_dispatch_vector);
+        };
+        m.def("_positive", positive_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto positive_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              positive_output_typeid_vector);
+        };
+        m.def("_positive_result_type", positive_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/positive.hpp
new file mode 100644
index 0000000000..a7b19a07ab
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/positive.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_positive(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp
new file mode 100644
index 0000000000..a8ef6cb171
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp
@@ -0,0 +1,189 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "pow.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/pow.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B21: ===== POW (x1, x2)
+namespace impl
+{
+
+namespace pow_fn_ns = dpctl::tensor::kernels::pow;
+
+static binary_contig_impl_fn_ptr_t
+    pow_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int pow_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    pow_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    pow_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    pow_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_pow_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = pow_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::PowTypeMapFactory;
+    DispatchTableBuilder<int, PowTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(pow_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::PowStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, PowStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(pow_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::PowContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, PowContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(pow_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::PowInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         PowInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(pow_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::PowInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         PowInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(pow_inplace_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_pow(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_pow_dispatch_tables();
+        using impl::pow_contig_dispatch_table;
+        using impl::pow_output_id_table;
+        using impl::pow_strided_dispatch_table;
+
+        auto pow_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, pow_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                pow_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                pow_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto pow_result_type_pyapi = [&](const py::dtype &dtype1,
+                                         const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               pow_output_id_table);
+        };
+        m.def("_pow", pow_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_pow_result_type", pow_result_type_pyapi, "");
+
+        using impl::pow_inplace_contig_dispatch_table;
+        using impl::pow_inplace_strided_dispatch_table;
+
+        auto pow_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                     sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, pow_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                pow_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                pow_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_pow_inplace", pow_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
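`_pow` shares the calling convention of the other binary bindings; a small integer-only sketch avoids any promotion subtleties (the result dtype still comes from `pow_output_id_table` via `_pow_result_type`):

```python
import dpctl
import dpctl.tensor as dpt
from dpctl.tensor import _tensor_impl as ti

q = dpctl.SyclQueue()
base = dpt.asarray([2, 3, 4], dtype="i4", sycl_queue=q)
exp = dpt.asarray([3, 2, 1], dtype="i4", sycl_queue=q)

res_dt = ti._pow_result_type(base.dtype, exp.dtype)
dst = dpt.empty(base.shape, dtype=res_dt, sycl_queue=q)
ht, _ = ti._pow(src1=base, src2=exp, dst=dst, sycl_queue=q)
ht.wait()
print(dpt.asnumpy(dst))  # expected: [8 9 4]
```

Note that, unlike multiply, the in-place pow binding passes a `NullPtrTable` for the row/matrix broadcast slot, so that specialization simply falls back to the strided path.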
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/pow.hpp
new file mode 100644
index 0000000000..7a13b414eb
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/pow.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_pow(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp
new file mode 100644
index 0000000000..60060084e1
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "proj.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/proj.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U40: ==== PROJ (x)
+namespace impl
+{
+
+namespace proj_fn_ns = dpctl::tensor::kernels::proj;
+
+static unary_contig_impl_fn_ptr_t proj_contig_dispatch_vector[td_ns::num_types];
+static int proj_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    proj_strided_dispatch_vector[td_ns::num_types];
+
+void populate_proj_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = proj_fn_ns;
+
+    using fn_ns::ProjContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ProjContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(proj_contig_dispatch_vector);
+
+    using fn_ns::ProjStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ProjStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(proj_strided_dispatch_vector);
+
+    using fn_ns::ProjTypeMapFactory;
+    DispatchVectorBuilder<int, ProjTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(proj_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_proj(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_proj_dispatch_vectors();
+        using impl::proj_contig_dispatch_vector;
+        using impl::proj_output_typeid_vector;
+        using impl::proj_strided_dispatch_vector;
+
+        auto proj_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, proj_output_typeid_vector,
+                proj_contig_dispatch_vector, proj_strided_dispatch_vector);
+        };
+        m.def("_proj", proj_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto proj_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              proj_output_typeid_vector);
+        };
+        m.def("_proj_result_type", proj_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/proj.hpp
new file mode 100644
index 0000000000..efbe751455
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/proj.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_proj(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
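`_proj` computes the complex projection (in the spirit of `std::proj`: finite values pass through, values with an infinite component map to complex infinity). A sketch under the earlier assumptions, plus the additional assumption that the device supports the complex dtype chosen here:

```python
import dpctl
import dpctl.tensor as dpt
from dpctl.tensor import _tensor_impl as ti

q = dpctl.SyclQueue()
x = dpt.asarray([complex(1.0, 2.0), complex(float("inf"), -3.0)], sycl_queue=q)
dst = dpt.empty(x.shape, dtype=ti._proj_result_type(x.dtype), sycl_queue=q)

ht, _ = ti._proj(src=x, dst=dst, sycl_queue=q)
ht.wait()
# (1+2j) passes through unchanged; (inf-3j) is expected to project to inf-0j
print(dpt.asnumpy(dst))
```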
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "real.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/real.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U27: ==== REAL (x) +namespace impl +{ + +namespace real_fn_ns = dpctl::tensor::kernels::real; + +static unary_contig_impl_fn_ptr_t real_contig_dispatch_vector[td_ns::num_types]; +static int real_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + real_strided_dispatch_vector[td_ns::num_types]; + +void populate_real_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = real_fn_ns; + + using fn_ns::RealContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(real_contig_dispatch_vector); + + using fn_ns::RealStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(real_strided_dispatch_vector); + + using fn_ns::RealTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(real_output_typeid_vector); +}; + +} // namespace impl + +void init_real(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_real_dispatch_vectors(); + using impl::real_contig_dispatch_vector; + using impl::real_output_typeid_vector; + using impl::real_strided_dispatch_vector; + + auto real_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, real_output_typeid_vector, + real_contig_dispatch_vector, real_strided_dispatch_vector); + }; + m.def("_real", real_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto real_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, real_output_typeid_vector); + }; + m.def("_real_result_type", real_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/real.hpp new file mode 100644 index 0000000000..b380632448 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/real.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_real(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp new file mode 100644 index 0000000000..3255ea7e7f --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp @@ -0,0 +1,190 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
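+///
+/// Binary functions such as remainder dispatch on both operand types, so
+/// the flat vectors become num_types x num_types tables, and a companion
+/// integer table records the result type id for every operand pair, with
+/// a negative entry meaning the pair is unsupported. A toy sketch of the
+/// kind of mapping a TypeMapFactory encodes (the type ids and promotion
+/// rule below are illustrative only):
+///
+/// \code{.cpp}
+/// enum { tid_int32 = 0, tid_float32 = 1, tid_float64 = 2, n_tids = 3 };
+///
+/// static int output_id_table[n_tids][n_tids];
+///
+/// static void populate_type_map()
+/// {
+///     for (int i = 0; i < n_tids; ++i) {
+///         for (int j = 0; j < n_tids; ++j) {
+///             // promote to the wider operand type; a real factory
+///             // would consult the operation's type-support matrix
+///             output_id_table[i][j] = (i > j) ? i : j;
+///         }
+///     }
+/// }
+/// \endcode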
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "remainder.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" +#include "kernels/elementwise_functions/remainder.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B22: ===== REMAINDER (x1, x2) +namespace impl +{ + +namespace remainder_fn_ns = dpctl::tensor::kernels::remainder; + +static binary_contig_impl_fn_ptr_t + remainder_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int remainder_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + remainder_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + remainder_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + remainder_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_remainder_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = remainder_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::RemainderTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(remainder_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::RemainderStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(remainder_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::RemainderContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(remainder_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::RemainderInplaceStridedFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(remainder_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::RemainderInplaceContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(remainder_inplace_contig_dispatch_table); +} + +} // namespace impl + +void init_remainder(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_remainder_dispatch_tables(); + using impl::remainder_contig_dispatch_table; + using impl::remainder_output_id_table; + using impl::remainder_strided_dispatch_table; + + auto remainder_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, remainder_output_id_table, + // function pointers to handle operation on 
contiguous arrays + // (pointers may be nullptr) + remainder_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + remainder_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto remainder_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + remainder_output_id_table); + }; + m.def("_remainder", remainder_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_remainder_result_type", remainder_result_type_pyapi, ""); + + using impl::remainder_inplace_contig_dispatch_table; + using impl::remainder_inplace_strided_dispatch_table; + + auto remainder_inplace_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, remainder_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + remainder_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + remainder_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_remainder_inplace", remainder_inplace_pyapi, "", py::arg("lhs"), + py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.hpp new file mode 100644 index 0000000000..ef538547a8 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
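+///
+/// Each of these headers exposes a single init_* hook; a central
+/// registration translation unit is then expected to call every hook on
+/// the pybind11 module object. A minimal sketch of that wiring, assuming
+/// a hypothetical module name _example_impl (the actual registration
+/// site is not part of this header):
+///
+/// \code{.cpp}
+/// #include <pybind11/pybind11.h>
+///
+/// namespace py = pybind11;
+///
+/// namespace dpctl { namespace tensor { namespace py_internal {
+/// void init_remainder(py::module_ m);
+/// void init_round(py::module_ m);
+/// }}} // namespace dpctl::tensor::py_internal
+///
+/// PYBIND11_MODULE(_example_impl, m)
+/// {
+///     using namespace dpctl::tensor::py_internal;
+///     init_remainder(m); // registers _remainder, _remainder_inplace, ...
+///     init_round(m);     // registers _round, _round_result_type
+/// }
+/// \endcode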
+//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_remainder(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp new file mode 100644 index 0000000000..cce730b899 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "round.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/round.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U28: ==== ROUND (x) +namespace impl +{ + +namespace round_fn_ns = dpctl::tensor::kernels::round; + +static unary_contig_impl_fn_ptr_t + round_contig_dispatch_vector[td_ns::num_types]; +static int round_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + round_strided_dispatch_vector[td_ns::num_types]; + +void populate_round_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = round_fn_ns; + + using fn_ns::RoundContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(round_contig_dispatch_vector); + + using fn_ns::RoundStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(round_strided_dispatch_vector); + + using fn_ns::RoundTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(round_output_typeid_vector); +}; + +} // namespace impl + +void init_round(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_round_dispatch_vectors(); + using impl::round_contig_dispatch_vector; + using impl::round_output_typeid_vector; + using impl::round_strided_dispatch_vector; + + auto round_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return 
py_unary_ufunc( + src, dst, exec_q, depends, round_output_typeid_vector, + round_contig_dispatch_vector, round_strided_dispatch_vector); + }; + m.def("_round", round_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto round_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + round_output_typeid_vector); + }; + m.def("_round_result_type", round_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/round.hpp new file mode 100644 index 0000000000..5753ef233b --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/round.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_round(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp new file mode 100644 index 0000000000..4661fdfa48 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
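+///
+/// At call time, py_unary_ufunc looks up the source array's type id in
+/// the populated tables and prefers the contiguous specialization when
+/// both arrays are contiguous, falling back to the general strided
+/// kernel otherwise. Roughly (the function-pointer signatures below are
+/// simplified placeholders, not dpctl's actual typedefs):
+///
+/// \code{.cpp}
+/// #include <cstddef>
+///
+/// constexpr int num_types = 14;
+/// using contig_fn_t = void (*)(const char *, char *, std::size_t);
+/// using strided_fn_t = void (*)(const char *, char *, std::size_t, int);
+///
+/// // returns 0: contig path, 1: strided path, -1: unsupported dtype
+/// int select_impl(int src_typeid, bool both_contig,
+///                 contig_fn_t (&contig)[num_types],
+///                 strided_fn_t (&strided)[num_types])
+/// {
+///     if (src_typeid < 0 || src_typeid >= num_types)
+///         return -1;
+///     if (both_contig && contig[src_typeid] != nullptr)
+///         return 0;
+///     return (strided[src_typeid] != nullptr) ? 1 : -1;
+/// }
+/// \endcode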
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "rsqrt.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/rsqrt.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U39: ==== RSQRT (x) +namespace impl +{ + +namespace rsqrt_fn_ns = dpctl::tensor::kernels::rsqrt; + +static unary_contig_impl_fn_ptr_t + rsqrt_contig_dispatch_vector[td_ns::num_types]; +static int rsqrt_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + rsqrt_strided_dispatch_vector[td_ns::num_types]; + +void populate_rsqrt_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = rsqrt_fn_ns; + + using fn_ns::RsqrtContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(rsqrt_contig_dispatch_vector); + + using fn_ns::RsqrtStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(rsqrt_strided_dispatch_vector); + + using fn_ns::RsqrtTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(rsqrt_output_typeid_vector); +}; + +} // namespace impl + +void init_rsqrt(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_rsqrt_dispatch_vectors(); + using impl::rsqrt_contig_dispatch_vector; + using impl::rsqrt_output_typeid_vector; + using impl::rsqrt_strided_dispatch_vector; + + auto rsqrt_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, rsqrt_output_typeid_vector, + rsqrt_contig_dispatch_vector, rsqrt_strided_dispatch_vector); + }; + m.def("_rsqrt", rsqrt_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto rsqrt_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + rsqrt_output_typeid_vector); + }; + m.def("_rsqrt_result_type", rsqrt_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.hpp new file mode 100644 index 0000000000..50efc16d79 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_rsqrt(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp new file mode 100644 index 0000000000..7b7c2c22e5 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
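+///
+/// The _*_result_type helpers exposed alongside each function answer the
+/// dtype-support question without launching anything: the input dtype is
+/// mapped to a type id and looked up in the populated typeid vector. In
+/// essence (a simplified model of the lookup, not the actual helper):
+///
+/// \code{.cpp}
+/// constexpr int num_types = 14;
+///
+/// // a negative result means the input dtype is not supported
+/// int result_typeid(int src_typeid, const int (&typeid_vec)[num_types])
+/// {
+///     if (src_typeid < 0 || src_typeid >= num_types)
+///         return -1;
+///     return typeid_vec[src_typeid];
+/// }
+/// \endcode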
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sign.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sign.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U29: ==== SIGN (x) +namespace impl +{ + +namespace sign_fn_ns = dpctl::tensor::kernels::sign; + +static unary_contig_impl_fn_ptr_t sign_contig_dispatch_vector[td_ns::num_types]; +static int sign_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sign_strided_dispatch_vector[td_ns::num_types]; + +void populate_sign_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sign_fn_ns; + + using fn_ns::SignContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sign_contig_dispatch_vector); + + using fn_ns::SignStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sign_strided_dispatch_vector); + + using fn_ns::SignTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sign_output_typeid_vector); +}; + +} // namespace impl + +void init_sign(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sign_dispatch_vectors(); + using impl::sign_contig_dispatch_vector; + using impl::sign_output_typeid_vector; + using impl::sign_strided_dispatch_vector; + + auto sign_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, sign_output_typeid_vector, + sign_contig_dispatch_vector, sign_strided_dispatch_vector); + }; + m.def("_sign", sign_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sign_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sign_output_typeid_vector); + }; + m.def("_sign_result_type", sign_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sign.hpp new file mode 100644 index 0000000000..fa01370842 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sign.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_sign(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp new file mode 100644 index 0000000000..fc101dd64b --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp @@ -0,0 +1,122 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
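+///
+/// Predicates like signbit differ from the other unary functions only in
+/// their type map: real floating-point inputs produce a boolean result
+/// and every other input type is rejected. A sketch of the output-type
+/// trait such a TypeMapFactory evaluates (SignbitOutputType is an
+/// illustrative name, not the kernel header's actual trait):
+///
+/// \code{.cpp}
+/// #include <type_traits>
+///
+/// template <typename T> struct SignbitOutputType
+/// {
+///     // void plays the role of "unsupported"
+///     using value_type =
+///         std::conditional_t<std::is_floating_point_v<T>, bool, void>;
+/// };
+///
+/// static_assert(
+///     std::is_same_v<SignbitOutputType<double>::value_type, bool>, "");
+/// static_assert(
+///     std::is_same_v<SignbitOutputType<int>::value_type, void>, "");
+/// \endcode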
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "signbit.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/signbit.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U41: ==== SIGNBIT (x) +namespace impl +{ + +namespace signbit_fn_ns = dpctl::tensor::kernels::signbit; + +static unary_contig_impl_fn_ptr_t + signbit_contig_dispatch_vector[td_ns::num_types]; +static int signbit_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + signbit_strided_dispatch_vector[td_ns::num_types]; + +void populate_signbit_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = signbit_fn_ns; + + using fn_ns::SignbitContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(signbit_contig_dispatch_vector); + + using fn_ns::SignbitStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(signbit_strided_dispatch_vector); + + using fn_ns::SignbitTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(signbit_output_typeid_vector); +}; + +} // namespace impl + +void init_signbit(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_signbit_dispatch_vectors(); + using impl::signbit_contig_dispatch_vector; + using impl::signbit_output_typeid_vector; + using impl::signbit_strided_dispatch_vector; + + auto signbit_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc(src, dst, exec_q, depends, + signbit_output_typeid_vector, + signbit_contig_dispatch_vector, + signbit_strided_dispatch_vector); + }; + m.def("_signbit", signbit_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto signbit_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + signbit_output_typeid_vector); + }; + m.def("_signbit_result_type", signbit_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.hpp new file mode 100644 index 0000000000..85054bb4de --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_signbit(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp new file mode 100644 index 0000000000..415dc15133 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
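+///
+/// Every binding in this directory takes an optional depends keyword
+/// defaulting to an empty py::list; the pybind11 stl.h caster converts
+/// it into the std::vector of sycl::event the lambda receives, which
+/// lets Python callers chain asynchronous submissions. A trimmed model
+/// of that default-argument mechanics (int stands in for sycl::event;
+/// bind_example is a hypothetical helper):
+///
+/// \code{.cpp}
+/// #include <pybind11/pybind11.h>
+/// #include <pybind11/stl.h>
+/// #include <vector>
+///
+/// namespace py = pybind11;
+///
+/// std::size_t count_deps(const std::vector<int> &depends)
+/// {
+///     return depends.size();
+/// }
+///
+/// // would be called with the module object during module init;
+/// // an omitted depends argument converts to an empty vector
+/// void bind_example(py::module_ &m)
+/// {
+///     m.def("_count_deps", &count_deps, py::arg("depends") = py::list());
+/// }
+/// \endcode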
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sin.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sin.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U30: ==== SIN (x) +namespace impl +{ + +namespace sin_fn_ns = dpctl::tensor::kernels::sin; + +static unary_contig_impl_fn_ptr_t sin_contig_dispatch_vector[td_ns::num_types]; +static int sin_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sin_strided_dispatch_vector[td_ns::num_types]; + +void populate_sin_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sin_fn_ns; + + using fn_ns::SinContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sin_contig_dispatch_vector); + + using fn_ns::SinStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sin_strided_dispatch_vector); + + using fn_ns::SinTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sin_output_typeid_vector); +}; + +} // namespace impl + +void init_sin(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sin_dispatch_vectors(); + using impl::sin_contig_dispatch_vector; + using impl::sin_output_typeid_vector; + using impl::sin_strided_dispatch_vector; + + auto sin_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, sin_output_typeid_vector, + sin_contig_dispatch_vector, sin_strided_dispatch_vector); + }; + m.def("_sin", sin_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sin_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sin_output_typeid_vector); + }; + m.def("_sin_result_type", sin_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sin.hpp new file mode 100644 index 0000000000..bd03604b16 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sin.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_sin(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp new file mode 100644 index 0000000000..d9f92eb8f1 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sinh.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sinh.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U31: ==== SINH (x) +namespace impl +{ + +namespace sinh_fn_ns = dpctl::tensor::kernels::sinh; + +static unary_contig_impl_fn_ptr_t sinh_contig_dispatch_vector[td_ns::num_types]; +static int sinh_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sinh_strided_dispatch_vector[td_ns::num_types]; + +void populate_sinh_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sinh_fn_ns; + + using fn_ns::SinhContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sinh_contig_dispatch_vector); + + using fn_ns::SinhStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sinh_strided_dispatch_vector); + + using fn_ns::SinhTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sinh_output_typeid_vector); +}; + +} // namespace impl + +void init_sinh(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sinh_dispatch_vectors(); + using impl::sinh_contig_dispatch_vector; + using impl::sinh_output_typeid_vector; + using impl::sinh_strided_dispatch_vector; + + auto sinh_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, sinh_output_typeid_vector, + sinh_contig_dispatch_vector, sinh_strided_dispatch_vector); + }; + m.def("_sinh", sinh_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sinh_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sinh_output_typeid_vector); + }; + m.def("_sinh_result_type", sinh_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.hpp new file mode 100644 index 0000000000..fef8ec416a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_sinh(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp new file mode 100644 index 0000000000..159d45b51c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sqrt.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sqrt.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U33: ==== SQRT (x) +namespace impl +{ + +namespace sqrt_fn_ns = dpctl::tensor::kernels::sqrt; + +static unary_contig_impl_fn_ptr_t sqrt_contig_dispatch_vector[td_ns::num_types]; +static int sqrt_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sqrt_strided_dispatch_vector[td_ns::num_types]; + +void populate_sqrt_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sqrt_fn_ns; + + using fn_ns::SqrtContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sqrt_contig_dispatch_vector); + + using fn_ns::SqrtStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sqrt_strided_dispatch_vector); + + using fn_ns::SqrtTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sqrt_output_typeid_vector); +}; + +} // namespace impl + +void init_sqrt(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sqrt_dispatch_vectors(); + using impl::sqrt_contig_dispatch_vector; + using impl::sqrt_output_typeid_vector; + using impl::sqrt_strided_dispatch_vector; + + auto sqrt_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, sqrt_output_typeid_vector, + sqrt_contig_dispatch_vector, sqrt_strided_dispatch_vector); + }; + m.def("_sqrt", sqrt_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sqrt_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sqrt_output_typeid_vector); + }; + m.def("_sqrt_result_type", sqrt_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.hpp new file mode 100644 index 0000000000..38ea68635b --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_sqrt(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp new file mode 100644 index 0000000000..184e09c19c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "square.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/square.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U32: ==== SQUARE (x) +namespace impl +{ + +namespace square_fn_ns = dpctl::tensor::kernels::square; + +static unary_contig_impl_fn_ptr_t + square_contig_dispatch_vector[td_ns::num_types]; +static int square_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + square_strided_dispatch_vector[td_ns::num_types]; + +void populate_square_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = square_fn_ns; + + using fn_ns::SquareContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(square_contig_dispatch_vector); + + using fn_ns::SquareStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(square_strided_dispatch_vector); + + using fn_ns::SquareTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(square_output_typeid_vector); +}; + +} // namespace impl + +void init_square(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_square_dispatch_vectors(); + using impl::square_contig_dispatch_vector; + using impl::square_output_typeid_vector; + using impl::square_strided_dispatch_vector; + + auto square_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, square_output_typeid_vector, + square_contig_dispatch_vector, square_strided_dispatch_vector); + }; + m.def("_square", square_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto square_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + square_output_typeid_vector); + }; + m.def("_square_result_type", square_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/square.hpp new file mode 100644 index 0000000000..d8268b728a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/square.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_square(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp new file mode 100644 index 0000000000..9703182e7a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp @@ -0,0 +1,229 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
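+///
+/// Unlike remainder above, subtract also fills the two broadcast tables,
+/// so a C-contiguous matrix combined with a C-contiguous row takes a
+/// fast path instead of the general strided machinery. The computation
+/// that fast path performs is simply the following (an illustrative
+/// serial loop, not the SYCL kernel itself):
+///
+/// \code{.cpp}
+/// #include <cstddef>
+///
+/// // out[i][j] = mat[i][j] - row[j] over an n x m C-contiguous matrix
+/// template <typename T>
+/// void subtract_matrix_row(const T *mat, const T *row, T *out,
+///                          std::size_t n, std::size_t m)
+/// {
+///     for (std::size_t i = 0; i < n; ++i)
+///         for (std::size_t j = 0; j < m; ++j)
+///             out[i * m + j] = mat[i * m + j] - row[j];
+/// }
+/// \endcode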
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "subtract.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" +#include "kernels/elementwise_functions/subtract.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B23: ===== SUBTRACT (x1, x2) +namespace impl +{ +namespace subtract_fn_ns = dpctl::tensor::kernels::subtract; + +static binary_contig_impl_fn_ptr_t + subtract_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int subtract_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + subtract_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +// sub(matrix, row) +static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t + subtract_contig_matrix_contig_row_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +// sub(row, matrix) +static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t + subtract_contig_row_contig_matrix_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + subtract_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + subtract_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; +static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t + subtract_inplace_row_matrix_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_subtract_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = subtract_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::SubtractTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(subtract_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::SubtractStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(subtract_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::SubtractContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(subtract_contig_dispatch_table); + + // function pointers for operation on contiguous matrix, contiguous row + // with contiguous matrix output + using fn_ns::SubtractContigMatrixContigRowBroadcastFactory; + DispatchTableBuilder< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, + SubtractContigMatrixContigRowBroadcastFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + subtract_contig_matrix_contig_row_broadcast_dispatch_table); + + // function pointers for operation on contiguous row, contiguous matrix + // with contiguous matrix output + using 
fn_ns::SubtractContigRowContigMatrixBroadcastFactory; + DispatchTableBuilder< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, + SubtractContigRowContigMatrixBroadcastFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + subtract_contig_row_contig_matrix_broadcast_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::SubtractInplaceStridedFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(subtract_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::SubtractInplaceContigFactory; + DispatchTableBuilder + dtb7; + dtb7.populate_dispatch_table(subtract_inplace_contig_dispatch_table); + + // function pointers for inplace operation on contiguous matrix + // and contiguous row + using fn_ns::SubtractInplaceRowMatrixBroadcastFactory; + DispatchTableBuilder + dtb8; + dtb8.populate_dispatch_table(subtract_inplace_row_matrix_dispatch_table); +}; + +} // namespace impl + +void init_subtract(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_subtract_dispatch_tables(); + using impl::subtract_contig_dispatch_table; + using impl::subtract_contig_matrix_contig_row_broadcast_dispatch_table; + using impl::subtract_contig_row_contig_matrix_broadcast_dispatch_table; + using impl::subtract_output_id_table; + using impl::subtract_strided_dispatch_table; + + auto subtract_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, subtract_output_id_table, + // function pointers to handle operation on contiguous + // arrays (pointers may be nullptr) + subtract_contig_dispatch_table, + // function pointers to handle operation on strided arrays + // (most general case) + subtract_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix + // and c-contig row with broadcasting (may be nullptr) + subtract_contig_matrix_contig_row_broadcast_dispatch_table, + // function pointers to handle operation of c-contig matrix + // and c-contig row with broadcasting (may be nullptr) + subtract_contig_row_contig_matrix_broadcast_dispatch_table); + }; + auto subtract_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + subtract_output_id_table); + }; + m.def("_subtract", subtract_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_subtract_result_type", subtract_result_type_pyapi, ""); + + using impl::subtract_inplace_contig_dispatch_table; + using impl::subtract_inplace_row_matrix_dispatch_table; + using impl::subtract_inplace_strided_dispatch_table; + + auto subtract_inplace_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, subtract_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + subtract_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + subtract_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + 
subtract_inplace_row_matrix_dispatch_table); + }; + m.def("_subtract_inplace", subtract_inplace_pyapi, "", py::arg("lhs"), + py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.hpp new file mode 100644 index 0000000000..0a4d707865 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_subtract(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp new file mode 100644 index 0000000000..2f1fbf55f2 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
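The `init_subtract` binding above is representative of every binary elementwise operation in this PR: kernels are reached through num_types x num_types tables of function pointers indexed by the type ids of the two inputs, with a parallel integer table giving the result type id. A minimal self-contained sketch of that layout (all names and type ids here are invented for illustration; this is not dpctl's actual API):

#include <cstddef>
#include <iostream>

// Three pretend type ids: 0 = int, 1 = float, 2 = double.
constexpr int num_types = 3;

using binary_fn_ptr_t = void (*)(const void *, const void *, void *, std::size_t);

template <typename T1, typename T2, typename Tout>
void subtract_kernel(const void *a, const void *b, void *out, std::size_t n)
{
    const T1 *x = static_cast<const T1 *>(a);
    const T2 *y = static_cast<const T2 *>(b);
    Tout *r = static_cast<Tout *>(out);
    for (std::size_t i = 0; i < n; ++i)
        r[i] = static_cast<Tout>(x[i]) - static_cast<Tout>(y[i]);
}

// Populated once at module load, then indexed as table[src1_typeid][src2_typeid].
static binary_fn_ptr_t dispatch[num_types][num_types] = {};

int main()
{
    // Fill a few entries; unsupported pairs stay nullptr, which is why
    // the caller must check before invoking.
    dispatch[0][0] = subtract_kernel<int, int, int>;
    dispatch[0][1] = subtract_kernel<int, float, float>;

    int a[2] = {5, 7};
    float b[2] = {1.5f, 2.5f};
    float out[2];
    dispatch[0][1](a, b, out, 2); // look up by (src1 typeid, src2 typeid)
    std::cout << out[0] << ", " << out[1] << "\n"; // 3.5, 4.5
}

The contiguous, strided and matrix/row-broadcast variants registered above are additional tables of the same shape, each holding pointers to kernels specialized for that memory layout.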
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "tan.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/tan.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U34: ==== TAN (x) +namespace impl +{ + +namespace tan_fn_ns = dpctl::tensor::kernels::tan; + +static unary_contig_impl_fn_ptr_t tan_contig_dispatch_vector[td_ns::num_types]; +static int tan_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + tan_strided_dispatch_vector[td_ns::num_types]; + +void populate_tan_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = tan_fn_ns; + + using fn_ns::TanContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(tan_contig_dispatch_vector); + + using fn_ns::TanStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(tan_strided_dispatch_vector); + + using fn_ns::TanTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(tan_output_typeid_vector); +}; + +} // namespace impl + +void init_tan(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_tan_dispatch_vectors(); + using impl::tan_contig_dispatch_vector; + using impl::tan_output_typeid_vector; + using impl::tan_strided_dispatch_vector; + + auto tan_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, tan_output_typeid_vector, + tan_contig_dispatch_vector, tan_strided_dispatch_vector); + }; + m.def("_tan", tan_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto tan_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, tan_output_typeid_vector); + }; + m.def("_tan_result_type", tan_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/tan.hpp new file mode 100644 index 0000000000..f89c8b8f6d --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/tan.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
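Unary operations such as `tan` take a single input type, so the two-dimensional tables collapse to one-dimensional dispatch vectors plus a type-id map, as populated by `populate_tan_dispatch_vectors` above. A hedged sketch of the same idea (invented names and pretend type ids, not dpctl's kernels):

#include <cmath>
#include <cstddef>
#include <iostream>

constexpr int num_types = 2; // pretend: 0 = float, 1 = double

using unary_fn_ptr_t = void (*)(const void *, void *, std::size_t);

template <typename T> void tan_kernel(const void *in, void *out, std::size_t n)
{
    const T *x = static_cast<const T *>(in);
    T *r = static_cast<T *>(out);
    for (std::size_t i = 0; i < n; ++i)
        r[i] = std::tan(x[i]);
}

// One slot per input type id ...
static unary_fn_ptr_t contig_dispatch[num_types] = {tan_kernel<float>,
                                                    tan_kernel<double>};
// ... and a parallel map from input type id to output type id.
static int output_typeid[num_types] = {0, 1}; // tan preserves the type

int main()
{
    double x[3] = {0.0, 0.25, 0.5};
    double y[3];
    contig_dispatch[1](x, y, 3);
    std::cout << y[1] << " (output typeid " << output_typeid[1] << ")\n";
}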
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_tan(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp
new file mode 100644
index 0000000000..033389e46d
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "tanh.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/tanh.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U35: ==== TANH (x) +namespace impl +{ + +namespace tanh_fn_ns = dpctl::tensor::kernels::tanh; + +static unary_contig_impl_fn_ptr_t tanh_contig_dispatch_vector[td_ns::num_types]; +static int tanh_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + tanh_strided_dispatch_vector[td_ns::num_types]; + +void populate_tanh_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = tanh_fn_ns; + + using fn_ns::TanhContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(tanh_contig_dispatch_vector); + + using fn_ns::TanhStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(tanh_strided_dispatch_vector); + + using fn_ns::TanhTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(tanh_output_typeid_vector); +}; + +} // namespace impl + +void init_tanh(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_tanh_dispatch_vectors(); + using impl::tanh_contig_dispatch_vector; + using impl::tanh_output_typeid_vector; + using impl::tanh_strided_dispatch_vector; + + auto tanh_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, tanh_output_typeid_vector, + tanh_contig_dispatch_vector, tanh_strided_dispatch_vector); + }; + m.def("_tanh", tanh_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto tanh_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, tanh_output_typeid_vector); + }; + m.def("_tanh_result_type", tanh_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.hpp new file mode 100644 index 0000000000..e456182971 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_tanh(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp
new file mode 100644
index 0000000000..22ad9bf3cb
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp
@@ -0,0 +1,241 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "true_divide.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" +#include "kernels/elementwise_functions/true_divide.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B08: ===== DIVIDE (x1, x2) +namespace impl +{ +namespace true_divide_fn_ns = dpctl::tensor::kernels::true_divide; + +static binary_contig_impl_fn_ptr_t + true_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int true_divide_output_id_table[td_ns::num_types][td_ns::num_types]; +static int true_divide_inplace_output_id_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + true_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +// divide(matrix, row) +static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t + true_divide_contig_matrix_contig_row_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +// divide(row, matrix) +static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t + true_divide_contig_row_contig_matrix_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + true_divide_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + true_divide_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t + true_divide_inplace_row_matrix_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_true_divide_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = true_divide_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::TrueDivideTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(true_divide_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::TrueDivideStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(true_divide_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::TrueDivideContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(true_divide_contig_dispatch_table); + + // function pointers for operation on contiguous matrix, contiguous row + // with contiguous matrix output + using fn_ns::TrueDivideContigMatrixContigRowBroadcastFactory; + DispatchTableBuilder< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, + TrueDivideContigMatrixContigRowBroadcastFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + true_divide_contig_matrix_contig_row_broadcast_dispatch_table); 
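// A sketch of what the DispatchTableBuilder objects used throughout this
// function do (the real template lives in the included
// utils/type_dispatch.hpp; the factory body below is invented for
// illustration only):
//
//     template <typename fnT, typename T1, typename T2>
//     struct TrueDivideContigFactory {
//         fnT get() {
//             if constexpr (/* (T1, T2) is a supported pair */) {
//                 return true_divide_contig_impl<T1, T2>;
//             } else {
//                 return nullptr;
//             }
//         }
//     };
//
// populate_dispatch_table() instantiates the factory for every pair of the
// td_ns::num_types known types and writes each result into the
// num_types x num_types table, so each per-call lookup is a constant-time
// double index.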
+ + // function pointers for operation on contiguous row, contiguous matrix + // with contiguous matrix output + using fn_ns::TrueDivideContigRowContigMatrixBroadcastFactory; + DispatchTableBuilder< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, + TrueDivideContigRowContigMatrixBroadcastFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + true_divide_contig_row_contig_matrix_broadcast_dispatch_table); + + // which input types are supported, and what is the type of the result + using fn_ns::TrueDivideInplaceTypeMapFactory; + DispatchTableBuilder dtb6; + dtb6.populate_dispatch_table(true_divide_inplace_output_id_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::TrueDivideInplaceStridedFactory; + DispatchTableBuilder + dtb7; + dtb7.populate_dispatch_table(true_divide_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::TrueDivideInplaceContigFactory; + DispatchTableBuilder + dtb8; + dtb8.populate_dispatch_table(true_divide_inplace_contig_dispatch_table); + + // function pointers for inplace operation on contiguous matrix + // and contiguous row + using fn_ns::TrueDivideInplaceRowMatrixBroadcastFactory; + DispatchTableBuilder + dtb9; + dtb9.populate_dispatch_table(true_divide_inplace_row_matrix_dispatch_table); +}; + +} // namespace impl + +void init_divide(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_true_divide_dispatch_tables(); + using impl::true_divide_contig_dispatch_table; + using impl:: + true_divide_contig_matrix_contig_row_broadcast_dispatch_table; + using impl:: + true_divide_contig_row_contig_matrix_broadcast_dispatch_table; + using impl::true_divide_output_id_table; + using impl::true_divide_strided_dispatch_table; + + auto divide_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, true_divide_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + true_divide_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + true_divide_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + true_divide_contig_matrix_contig_row_broadcast_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + true_divide_contig_row_contig_matrix_broadcast_dispatch_table); + }; + auto divide_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + true_divide_output_id_table); + }; + m.def("_divide", divide_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_divide_result_type", divide_result_type_pyapi, ""); + + using impl::true_divide_inplace_contig_dispatch_table; + using impl::true_divide_inplace_output_id_table; + using impl::true_divide_inplace_row_matrix_dispatch_table; + using impl::true_divide_inplace_strided_dispatch_table; + + auto divide_inplace_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, 
true_divide_inplace_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + true_divide_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + true_divide_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + true_divide_inplace_row_matrix_dispatch_table); + }; + m.def("_divide_inplace", divide_inplace_pyapi, "", py::arg("lhs"), + py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.hpp new file mode 100644 index 0000000000..e29b858dae --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_divide(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp new file mode 100644 index 0000000000..5b2f451fb0 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
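Unlike subtract, the divide bindings above carry a second id table, `true_divide_inplace_output_id_table`. True division of integers produces a floating-point result, so `x /= y` can only be routed to a kernel when the computed result type matches the type of the left-hand side; the separate inplace table encodes that restriction. A small sketch of how such id tables answer result-type queries (all values invented for a pretend three-type system, not dpctl's real table):

#include <iostream>
#include <stdexcept>

constexpr int num_types = 3; // pretend: 0 = int, 1 = float, 2 = double

// Out-of-place divide: integer inputs promote to floating point.
static int divide_result_id[num_types][num_types] = {
    {1, 1, 2}, // int    / {int, float, double}
    {1, 1, 2}, // float  / ...
    {2, 2, 2}, // double / ...
};

// In-place divide: only pairs whose result type equals the lhs type;
// -1 marks a rejected combination.
static int divide_inplace_id[num_types][num_types] = {
    {-1, -1, -1}, // int   /= anything would change the dtype
    {1, 1, -1},   // float /= double would too
    {2, 2, 2},
};

int inplace_result_type_or_throw(int lhs_id, int rhs_id)
{
    int out = divide_inplace_id[lhs_id][rhs_id];
    if (out < 0)
        throw std::invalid_argument("unsupported dtype combination");
    return out;
}

int main()
{
    std::cout << inplace_result_type_or_throw(2, 0) << "\n"; // 2 (double)
    std::cout << divide_result_id[0][0] << "\n";             // 1 (float)
}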
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "trunc.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/trunc.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U36: ==== TRUNC (x) +namespace impl +{ + +namespace trunc_fn_ns = dpctl::tensor::kernels::trunc; + +static unary_contig_impl_fn_ptr_t + trunc_contig_dispatch_vector[td_ns::num_types]; +static int trunc_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + trunc_strided_dispatch_vector[td_ns::num_types]; + +void populate_trunc_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = trunc_fn_ns; + + using fn_ns::TruncContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(trunc_contig_dispatch_vector); + + using fn_ns::TruncStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(trunc_strided_dispatch_vector); + + using fn_ns::TruncTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(trunc_output_typeid_vector); +}; + +} // namespace impl + +void init_trunc(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_trunc_dispatch_vectors(); + using impl::trunc_contig_dispatch_vector; + using impl::trunc_output_typeid_vector; + using impl::trunc_strided_dispatch_vector; + + auto trunc_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, trunc_output_typeid_vector, + trunc_contig_dispatch_vector, trunc_strided_dispatch_vector); + }; + m.def("_trunc", trunc_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto trunc_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + trunc_output_typeid_vector); + }; + m.def("_trunc_result_type", trunc_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.hpp new file mode 100644 index 0000000000..cc28397f55 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
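Each `init_*` function above registers its lambdas with `m.def`, naming the arguments and defaulting the event-dependency list to an empty `py::list()`. A minimal standalone pybind11 module showing the same registration idiom (the module and function names here are invented, not part of dpctl):

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <vector>

namespace py = pybind11;

int count_events(int base, const std::vector<int> &depends)
{
    return base + static_cast<int>(depends.size());
}

PYBIND11_MODULE(example, m)
{
    // Named arguments plus an empty-list default for the dependency list,
    // matching the `py::arg("depends") = py::list()` idiom above.
    m.def("count_events", &count_events, py::arg("base"),
          py::arg("depends") = py::list());
}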
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_trunc(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/argmax.cpp b/dpctl/tensor/libtensor/source/reductions/argmax.cpp new file mode 100644 index 0000000000..1d83bf9c2d --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/argmax.cpp @@ -0,0 +1,119 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
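The search reductions that follow (`argmax`, then `argmin`) reduce over the trailing `trailing_dims_to_reduce` dimensions of the input: a row-major array of shape (batch..., red...) is treated as independent reductions of the flattened trailing block. A plain serial sketch of that contract (invented helper, not the SYCL kernel):

#include <cstddef>
#include <iostream>
#include <vector>

// Row-major data of logical shape (batch, red): one argmax per batch row.
std::vector<std::size_t> argmax_trailing(const std::vector<float> &data,
                                         std::size_t batch, std::size_t red)
{
    std::vector<std::size_t> out(batch);
    for (std::size_t b = 0; b < batch; ++b) {
        const float *row = data.data() + b * red;
        std::size_t best = 0;
        for (std::size_t i = 1; i < red; ++i)
            if (row[i] > row[best])
                best = i;
        out[b] = best;
    }
    return out;
}

int main()
{
    // shape (2, 3), reduce over the trailing axis
    std::vector<float> a = {1.f, 5.f, 2.f, 7.f, 0.f, 3.f};
    for (auto i : argmax_trailing(a, 2, 3))
        std::cout << i << " "; // 1 0
}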
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::search_strided_impl_fn_ptr; +static search_strided_impl_fn_ptr + argmax_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmax_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmax_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_argmax_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::search_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::ArgmaxOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmax_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::ArgmaxOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(argmax_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::ArgmaxOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(argmax_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_argmax(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_argmax_over_axis_dispatch_tables; + populate_argmax_over_axis_dispatch_tables(); + using impl::argmax_over_axis0_contig_temps_dispatch_table; + using impl::argmax_over_axis1_contig_temps_dispatch_table; + using impl::argmax_over_axis_strided_temps_dispatch_table; + + auto argmax_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmax_over_axis_strided_temps_dispatch_table, + argmax_over_axis0_contig_temps_dispatch_table, + argmax_over_axis1_contig_temps_dispatch_table); + }; + m.def("_argmax_over_axis", argmax_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sum_reductions.hpp b/dpctl/tensor/libtensor/source/reductions/argmax.hpp similarity index 90% rename from dpctl/tensor/libtensor/source/sum_reductions.hpp rename to dpctl/tensor/libtensor/source/reductions/argmax.hpp index ac612ec1f7..9958396b43 100644 --- a/dpctl/tensor/libtensor/source/sum_reductions.hpp +++ b/dpctl/tensor/libtensor/source/reductions/argmax.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // 
Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -23,9 +23,10 @@ //===--------------------------------------------------------------------===// #pragma once -#include #include +namespace py = pybind11; + namespace dpctl { namespace tensor @@ -33,7 +34,7 @@ namespace tensor namespace py_internal { -extern void init_reduction_functions(py::module_ m); +extern void init_argmax(py::module_ m); } // namespace py_internal } // namespace tensor diff --git a/dpctl/tensor/libtensor/source/reductions/argmin.cpp b/dpctl/tensor/libtensor/source/reductions/argmin.cpp new file mode 100644 index 0000000000..c6469e6864 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/argmin.cpp @@ -0,0 +1,119 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::search_strided_impl_fn_ptr; +static search_strided_impl_fn_ptr + argmin_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmin_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmin_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_argmin_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::search_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::ArgminOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmin_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::ArgminOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(argmin_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::ArgminOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(argmin_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_argmin(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using 
impl::populate_argmin_over_axis_dispatch_tables; + populate_argmin_over_axis_dispatch_tables(); + using impl::argmin_over_axis0_contig_temps_dispatch_table; + using impl::argmin_over_axis1_contig_temps_dispatch_table; + using impl::argmin_over_axis_strided_temps_dispatch_table; + + auto argmin_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmin_over_axis_strided_temps_dispatch_table, + argmin_over_axis0_contig_temps_dispatch_table, + argmin_over_axis1_contig_temps_dispatch_table); + }; + m.def("_argmin_over_axis", argmin_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/argmin.hpp b/dpctl/tensor/libtensor/source/reductions/argmin.hpp new file mode 100644 index 0000000000..ea6ef1931c --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/argmin.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_argmin(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp b/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp new file mode 100644 index 0000000000..e3b015a4e0 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp @@ -0,0 +1,136 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
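Each reduction above keeps three dispatch tables: a general strided implementation plus fast paths for inputs contiguous along the reduction axis (axis-1) or along the batch axis (axis-0). A toy sketch of the selection such a caller might perform (simplified; the real `py_search_over_axis` also consults type ids, checks for nullptr entries, and threads event dependencies):

#include <iostream>

using impl_fn = const char *(*)();

const char *strided_impl() { return "strided (most general)"; }
const char *axis0_impl() { return "axis-0 contiguous fast path"; }
const char *axis1_impl() { return "axis-1 contiguous fast path"; }

// The real code keeps one num_types x num_types table per layout; a single
// entry per layout stands in for each table here.
impl_fn choose(bool reduction_axis_contig, bool batch_axis_contig)
{
    if (reduction_axis_contig)
        return axis1_impl; // the reduction runs over unit-stride data
    if (batch_axis_contig)
        return axis0_impl;
    return strided_impl;
}

int main() { std::cout << choose(true, false)() << "\n"; }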
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + logsumexp_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + logsumexp_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + logsumexp_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_logsumexp_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::LogSumExpOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table( + logsumexp_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::LogSumExpOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table( + logsumexp_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::LogSumExpOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + logsumexp_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_logsumexp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_logsumexp_over_axis_dispatch_tables; + populate_logsumexp_over_axis_dispatch_tables(); + using impl::logsumexp_over_axis0_contig_temps_dispatch_table; + using impl::logsumexp_over_axis1_contig_temps_dispatch_table; + using impl::logsumexp_over_axis_strided_temps_dispatch_table; + + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + + auto logsumexp_pyapi = [&](const arrayT &src, + int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_tree_reduction_over_axis; + return py_tree_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + logsumexp_over_axis_strided_temps_dispatch_table, + logsumexp_over_axis0_contig_temps_dispatch_table, + logsumexp_over_axis1_contig_temps_dispatch_table); + }; + m.def("_logsumexp_over_axis", logsumexp_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto logsumexp_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported; + return py_tree_reduction_dtype_supported( + input_dtype, output_dtype, + logsumexp_over_axis_strided_temps_dispatch_table); + }; + m.def("_logsumexp_over_axis_dtype_supported", logsumexp_dtype_supported, + "", py::arg("arg_dtype"), 
py::arg("out_dtype")); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/logsumexp.hpp b/dpctl/tensor/libtensor/source/reductions/logsumexp.hpp new file mode 100644 index 0000000000..46b2156f46 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/logsumexp.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_logsumexp(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/max.cpp b/dpctl/tensor/libtensor/source/reductions/max.cpp new file mode 100644 index 0000000000..32c60b943b --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/max.cpp @@ -0,0 +1,171 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + max_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_max_over_axis_dispatch_tables(void) +{ + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::MaxOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(max_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(max_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(max_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(max_over_axis0_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(max_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(max_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t max_atomic_support_vector[td_ns::num_types]; + +void populate_max_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::MaxAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(max_atomic_support_vector); +} + +} // namespace impl + +void init_max(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_max_over_axis_dispatch_tables; + populate_max_over_axis_dispatch_tables(); + using impl::max_over_axis0_contig_atomic_dispatch_table; + using impl::max_over_axis0_contig_temps_dispatch_table; + using impl::max_over_axis1_contig_atomic_dispatch_table; + using impl::max_over_axis1_contig_temps_dispatch_table; + using 
impl::max_over_axis_strided_atomic_dispatch_table; + using impl::max_over_axis_strided_temps_dispatch_table; + + using impl::populate_max_atomic_support_dispatch_vector; + populate_max_atomic_support_dispatch_vector(); + using impl::max_atomic_support_vector; + + auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + max_over_axis_strided_atomic_dispatch_table, + max_over_axis0_contig_atomic_dispatch_table, + max_over_axis1_contig_atomic_dispatch_table, + max_over_axis_strided_temps_dispatch_table, + max_over_axis0_contig_temps_dispatch_table, + max_over_axis1_contig_temps_dispatch_table, + max_atomic_support_vector); + }; + m.def("_max_over_axis", max_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/max.hpp b/dpctl/tensor/libtensor/source/reductions/max.hpp new file mode 100644 index 0000000000..05a31fc1fb --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/max.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_max(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/min.cpp b/dpctl/tensor/libtensor/source/reductions/min.cpp new file mode 100644 index 0000000000..de1a81387d --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/min.cpp @@ -0,0 +1,173 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
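`init_max` above hands `py_reduction_over_axis` two full sets of tables, atomic and temps, plus `max_atomic_support_vector`. The intent is a runtime choice: take the single-pass atomic kernels when the device and output type support the required atomics, and fall back to tree reduction through temporary buffers otherwise. A toy sketch of that selection (the predicate values are invented):

#include <iostream>

using reduce_fn = const char *(*)();

const char *atomic_path() { return "single-pass kernel using device atomics"; }
const char *temps_path() { return "tree reduction via temporary buffers"; }

// Stand-in for max_atomic_support_vector[output_typeid](...).
bool atomics_supported_for(int output_typeid)
{
    // e.g. 64-bit floating-point atomics may be unavailable on a device
    return output_typeid != 2;
}

reduce_fn choose(int output_typeid, reduce_fn atomic_impl, reduce_fn temps_impl)
{
    return (atomic_impl && atomics_supported_for(output_typeid)) ? atomic_impl
                                                                 : temps_impl;
}

int main() { std::cout << choose(2, atomic_path, temps_path)() << "\n"; }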
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + min_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_min_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::MinOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(min_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(min_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(min_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(min_over_axis0_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(min_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(min_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t min_atomic_support_vector[td_ns::num_types]; + +void populate_min_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::MinAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(min_atomic_support_vector); +} + +} // namespace impl + +void init_min(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_min_over_axis_dispatch_tables; + populate_min_over_axis_dispatch_tables(); + using 
impl::min_over_axis0_contig_atomic_dispatch_table; + using impl::min_over_axis0_contig_temps_dispatch_table; + using impl::min_over_axis1_contig_atomic_dispatch_table; + using impl::min_over_axis1_contig_temps_dispatch_table; + using impl::min_over_axis_strided_atomic_dispatch_table; + using impl::min_over_axis_strided_temps_dispatch_table; + + using impl::populate_min_atomic_support_dispatch_vector; + populate_min_atomic_support_dispatch_vector(); + using impl::min_atomic_support_vector; + + auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + min_over_axis_strided_atomic_dispatch_table, + min_over_axis0_contig_atomic_dispatch_table, + min_over_axis1_contig_atomic_dispatch_table, + min_over_axis_strided_temps_dispatch_table, + min_over_axis0_contig_temps_dispatch_table, + min_over_axis1_contig_temps_dispatch_table, + min_atomic_support_vector); + }; + m.def("_min_over_axis", min_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/min.hpp b/dpctl/tensor/libtensor/source/reductions/min.hpp new file mode 100644 index 0000000000..cad94c7533 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/min.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_min(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/prod.cpp b/dpctl/tensor/libtensor/source/reductions/prod.cpp new file mode 100644 index 0000000000..a90d04304a --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/prod.cpp @@ -0,0 +1,187 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
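[Note: the DispatchTableBuilder/factory machinery driving populate_min_over_axis_dispatch_tables (and its max/prod/sum/hypot siblings) boils down to: instantiate a factory per (source, destination) type pair and record the function pointer its get() returns, with nullptr marking an unsupported pair. The real builders take template arguments such as the function-pointer type, the factory, and td_ns::num_types, which this rendering dropped between angle brackets. A self-contained miniature with a toy three-type universe and a demo factory in place of the dpctl kernels:]

    #include <cstddef>
    #include <cstdio>

    constexpr int num_types = 3; // miniature type universe

    using reduce_fn = long (*)(const void *src, std::size_t n);

    // One factory instantiation per (src, dst) pair; get() returns nullptr
    // for unsupported pairs, which is how the per-reduction factories signal
    // "fall back to another table".
    template <int src_id, int dst_id> struct DemoFactory
    {
        static long impl(const void *, std::size_t n)
        {
            return 100L * src_id + 10L * dst_id + static_cast<long>(n);
        }
        reduce_fn get() { return (dst_id >= src_id) ? impl : nullptr; }
    };

    template <template <int, int> class Factory> struct DispatchTableBuilder
    {
        template <int s> void fill_row(reduce_fn table[][num_types])
        {
            table[s][0] = Factory<s, 0>{}.get();
            table[s][1] = Factory<s, 1>{}.get();
            table[s][2] = Factory<s, 2>{}.get();
        }
        void populate_dispatch_table(reduce_fn table[][num_types])
        {
            fill_row<0>(table);
            fill_row<1>(table);
            fill_row<2>(table);
        }
    };

    static reduce_fn demo_table[num_types][num_types];

    int main()
    {
        DispatchTableBuilder<DemoFactory> dtb;
        dtb.populate_dispatch_table(demo_table);
        if (reduce_fn fn = demo_table[0][2]) {    // lookup by (src, dst) ids
            std::printf("%ld\n", fn(nullptr, 7)); // prints 27
        }
        return 0;
    }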
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + prod_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_prod_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::ProductOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(prod_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(prod_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(prod_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(prod_over_axis0_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(prod_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(prod_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t prod_atomic_support_vector[td_ns::num_types]; + +void populate_prod_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::ProductAtomicSupportFactory; + DispatchVectorBuilder + dvb; + 
dvb.populate_dispatch_vector(prod_atomic_support_vector); +} + +} // namespace impl + +void init_prod(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_prod_over_axis_dispatch_tables; + populate_prod_over_axis_dispatch_tables(); + using impl::prod_over_axis0_contig_atomic_dispatch_table; + using impl::prod_over_axis0_contig_temps_dispatch_table; + using impl::prod_over_axis1_contig_atomic_dispatch_table; + using impl::prod_over_axis1_contig_temps_dispatch_table; + using impl::prod_over_axis_strided_atomic_dispatch_table; + using impl::prod_over_axis_strided_temps_dispatch_table; + + using impl::populate_prod_atomic_support_dispatch_vector; + populate_prod_atomic_support_dispatch_vector(); + using impl::prod_atomic_support_vector; + + auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis0_contig_atomic_dispatch_table, + prod_over_axis1_contig_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + prod_over_axis0_contig_temps_dispatch_table, + prod_over_axis1_contig_temps_dispatch_table, + prod_atomic_support_vector); + }; + m.def("_prod_over_axis", prod_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto prod_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + prod_atomic_support_vector); + }; + m.def("_prod_over_axis_dtype_supported", prod_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/prod.hpp b/dpctl/tensor/libtensor/source/reductions/prod.hpp new file mode 100644 index 0000000000..026e7d8923 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/prod.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
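[Note: the _prod_over_axis_dtype_supported binding above answers "is this (arg, out) dtype pair implemented" purely by dispatch-table lookup. A minimal sketch of that decision order, assuming flat std::array tables and precomputed type ids in place of the dpctl lookup machinery:]

    #include <array>

    constexpr int num_types = 3;
    using kernel_fn = void (*)();

    // Mirrors py_reduction_dtype_supported: consult the atomic table when the
    // destination supports atomics, otherwise fall back to the slower
    // temps-based table; "supported" simply means a non-null entry.
    bool reduction_dtype_supported(
        int arg_typeid,
        int out_typeid,
        bool supports_atomics,
        const std::array<std::array<kernel_fn, num_types>, num_types> &atomic_tbl,
        const std::array<std::array<kernel_fn, num_types>, num_types> &temps_tbl)
    {
        kernel_fn fn = nullptr;
        if (supports_atomics) {
            fn = atomic_tbl[arg_typeid][out_typeid];
        }
        if (fn == nullptr) {
            fn = temps_tbl[arg_typeid][out_typeid];
        }
        return fn != nullptr;
    }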
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_prod(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp new file mode 100644 index 0000000000..c7313930b4 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp @@ -0,0 +1,132 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + hypot_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + hypot_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + hypot_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_hypot_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::HypotOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(hypot_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::HypotOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(hypot_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::HypotOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(hypot_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_reduce_hypot(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using 
impl::populate_hypot_over_axis_dispatch_tables; + populate_hypot_over_axis_dispatch_tables(); + using impl::hypot_over_axis0_contig_temps_dispatch_table; + using impl::hypot_over_axis1_contig_temps_dispatch_table; + using impl::hypot_over_axis_strided_temps_dispatch_table; + + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + + auto hypot_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_tree_reduction_over_axis; + return py_tree_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + hypot_over_axis_strided_temps_dispatch_table, + hypot_over_axis0_contig_temps_dispatch_table, + hypot_over_axis1_contig_temps_dispatch_table); + }; + m.def("_hypot_over_axis", hypot_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto hypot_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported; + return py_tree_reduction_dtype_supported( + input_dtype, output_dtype, + hypot_over_axis_strided_temps_dispatch_table); + }; + m.def("_hypot_over_axis_dtype_supported", hypot_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/reduce_hypot.hpp b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.hpp new file mode 100644 index 0000000000..92b7fac363 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
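[Note: hypot deliberately registers no atomic dispatch tables; only the tree-reduction ("temps") kernels exist for it. A host-side sketch of the pairwise-combination idea those kernels rely on; this is illustrative only, since the device kernels combine work-item partials rather than recursing over vector halves:]

    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Combine partial results pairwise; round-off grows ~log2(n) instead of
    // ~n for a running sum of squares. Precondition: hi > lo.
    double hypot_tree_reduce(const std::vector<double> &v, std::size_t lo,
                             std::size_t hi)
    {
        if (hi - lo == 1) {
            return std::fabs(v[lo]);
        }
        const std::size_t mid = lo + (hi - lo) / 2;
        return std::hypot(hypot_tree_reduce(v, lo, mid),
                          hypot_tree_reduce(v, mid, hi));
    }

    int main()
    {
        const std::vector<double> v{3.0, 4.0, 12.0};
        std::printf("%g\n", hypot_tree_reduce(v, 0, v.size())); // prints 13
    }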
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_reduce_hypot(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
new file mode 100644
index 0000000000..2478545efe
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
@@ -0,0 +1,143 @@
+//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include
+#include
+#include
+
+#include "utils/type_utils.hpp"
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+namespace atomic_support
+{
+
+typedef bool (*atomic_support_fn_ptr_t)(const sycl::queue &, sycl::usm::alloc);
+
+/*! @brief Function which returns a constant value for atomic support */
+template <bool return_value>
+bool fixed_decision(const sycl::queue &, sycl::usm::alloc)
+{
+    return return_value;
+}
+
+/*! @brief Template for querying atomic support for a type on a device */
+template <typename T>
+bool check_atomic_support(const sycl::queue &exec_q,
+                          sycl::usm::alloc usm_alloc_type)
+{
+    constexpr bool atomic32 = (sizeof(T) == 4);
+    constexpr bool atomic64 = (sizeof(T) == 8);
+    using dpctl::tensor::type_utils::is_complex;
+    if constexpr ((!atomic32 && !atomic64) || is_complex<T>::value) {
+        return fixed_decision<false>(exec_q, usm_alloc_type);
+    }
+    else {
+        bool supports_atomics = false;
+        const sycl::device &dev = exec_q.get_device();
+        if constexpr (atomic64) {
+            if (!dev.has(sycl::aspect::atomic64)) {
+                return false;
+            }
+        }
+        switch (usm_alloc_type) {
+        case sycl::usm::alloc::shared:
+            supports_atomics =
+                dev.has(sycl::aspect::usm_atomic_shared_allocations);
+            break;
+        case sycl::usm::alloc::host:
+            supports_atomics =
+                dev.has(sycl::aspect::usm_atomic_host_allocations);
+            break;
+        case sycl::usm::alloc::device:
+            supports_atomics = true;
+            break;
+        default:
+            supports_atomics = false;
+        }
+        return supports_atomics;
+    }
+}
+
+template <typename fnT, typename T> struct ArithmeticAtomicSupportFactory
+{
+    fnT get()
+    {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (std::is_floating_point_v<T> ||
+                      std::is_same_v<T, sycl::half> || is_complex<T>::value)
+        {
+            // For real- and complex-valued floating-point types, tree
+            // reduction has better round-off accumulation properties: its
+            // round-off error grows proportionally to log2(reduction_size),
+            // while the naive elementwise accumulation used by the atomic
+            // implementation has round-off error growing proportionally to
+            // reduction_size. Hence reductions over floating-point types
+            // always use the tree-reduction algorithm, even when an atomic
+            // implementation would be applicable.
+            return fixed_decision<false>;
+        }
+        else {
+            return check_atomic_support<T>;
+        }
+    }
+};
+
+template <typename fnT, typename T> struct MinMaxAtomicSupportFactory
+{
+    fnT get()
+    {
+        return check_atomic_support<T>;
+    }
+};
+
+template <typename fnT, typename T>
+struct MaxAtomicSupportFactory : public MinMaxAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct MinAtomicSupportFactory : public MinMaxAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct SumAtomicSupportFactory : public ArithmeticAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct ProductAtomicSupportFactory
+    : public ArithmeticAtomicSupportFactory<fnT, T>
+{
+};
+
+} // namespace atomic_support
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl/tensor/libtensor/source/reductions/reduction_common.cpp
new file mode 100644
index 0000000000..99edf663ad
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/reductions/reduction_common.cpp
@@ -0,0 +1,60 @@
+//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
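[Note: downstream, py_reduction_over_axis consumes one of these per-type predicate vectors by indexing it with the destination type id and calling the predicate with the execution queue and the USM kind of the destination allocation. A stand-in sketch of that consumption; queue_stub and the two predicates are illustrative, not dpctl types:]

    #include <cstdio>

    constexpr int num_types = 3;

    struct queue_stub // stand-in for sycl::queue plus allocation kind
    {
        bool device_usm;
    };

    using support_fn = bool (*)(const queue_stub &);

    bool never(const queue_stub &) { return false; } // fixed_decision<false>
    bool if_device_usm(const queue_stub &q) { return q.device_usm; }

    // one predicate per output type id, as in sum/prod/max/min vectors
    static support_fn atomic_support_vector[num_types] = {never, if_device_usm,
                                                          never};

    int main()
    {
        queue_stub q{true};
        int dst_typeid = 1; // normally derived from the destination dtype
        bool supports_atomics = atomic_support_vector[dst_typeid](q);
        std::printf("%s\n", supports_atomics ? "atomic table" : "temps table");
    }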
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include + +#include "argmax.hpp" +#include "argmin.hpp" +#include "logsumexp.hpp" +#include "max.hpp" +#include "min.hpp" +#include "prod.hpp" +#include "reduce_hypot.hpp" +#include "sum.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +/*! @brief Add reduction functions to Python module */ +void init_reduction_functions(py::module_ m) +{ + init_argmax(m); + init_argmin(m); + init_logsumexp(m); + init_max(m); + init_min(m); + init_prod(m); + init_reduce_hypot(m); + init_sum(m); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_common.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_common.hpp new file mode 100644 index 0000000000..61c992364a --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/reduction_common.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_reduction_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp new file mode 100644 index 0000000000..5aafe38a40 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -0,0 +1,1099 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
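[Note: the fan-out in init_reduction_functions keeps each reduction in its own translation unit behind a tiny init_* hook, so adding a reduction never touches more than one .cpp plus this list. The same structure as a self-contained module, with demo names and trivial bindings in place of the dpctl ones:]

    #include <pybind11/pybind11.h>

    namespace py = pybind11;

    // each per-reduction translation unit contributes one init_* entry point
    void init_demo_sum(py::module_ m)
    {
        m.def("_sum", [](int a, int b) { return a + b; });
    }
    void init_demo_prod(py::module_ m)
    {
        m.def("_prod", [](int a, int b) { return a * b; });
    }

    // the common translation unit only sequences the hooks
    void init_reduction_functions(py::module_ m)
    {
        init_demo_sum(m);
        init_demo_prod(m);
    }

    PYBIND11_MODULE(_demo_impl, m) { init_reduction_functions(m); }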
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for reductions. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +/* ====================== dtype supported ======================== */ + +/*! @brief Template implementing Python API for querying type support by + * reduction which may support atomics */ +template +bool py_reduction_dtype_supported( + const py::dtype &input_dtype, + const py::dtype &output_dtype, + const std::string &dst_usm_type, + sycl::queue &q, + const fnT &atomic_dispatch_table, + const fnT &temps_dispatch_table, + const CheckAtomicSupportFnT &check_atomic_support) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) + { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + // remove_all_extents gets underlying type of table + using fn_ptrT = typename std::remove_all_extents::type; + fn_ptrT fn = nullptr; + + sycl::usm::alloc kind = sycl::usm::alloc::unknown; + + if (dst_usm_type == "device") { + kind = sycl::usm::alloc::device; + } + else if (dst_usm_type == "shared") { + kind = sycl::usm::alloc::shared; + } + else if (dst_usm_type == "host") { + kind = sycl::usm::alloc::host; + } + else { + throw py::value_error("Unrecognized `dst_usm_type` argument."); + } + + bool supports_atomics = check_atomic_support[out_typeid](q, kind); + + if (supports_atomics) { + fn = atomic_dispatch_table[arg_typeid][out_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = temps_dispatch_table[arg_typeid][out_typeid]; + } + + return (fn != nullptr); +} + +/*! 
@brief Template implementing Python API for querying type support by tree + * reduction */ +template +bool py_tree_reduction_dtype_supported(const py::dtype &input_dtype, + const py::dtype &output_dtype, + const fnT &temps_dispatch_table) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) + { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + auto fn = temps_dispatch_table[arg_typeid][out_typeid]; + + return (fn != nullptr); +} + +/* ==================== Generic reductions ====================== */ + +/*! @brief Template implementing Python API for reduction over axis which may + * support atomics */ +template +std::pair py_reduction_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &atomic_dispatch_table, + const contig_fnT &axis0_atomic_dispatch_table, + const contig_fnT &axis1_atomic_dispatch_table, + const strided_fnT &temps_dispatch_table, + const contig_fnT &axis0_temps_dispatch_table, + const contig_fnT &axis1_temps_dispatch_table, + const SupportAtomicFnT &check_atomic_support) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < dst_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " 
+ "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + void *data_ptr = dst.get_data(); + const auto &ctx = exec_q.get_context(); + auto usm_type = sycl::get_pointer_type(data_ptr, ctx); + + bool supports_atomics = check_atomic_support[dst_typeid](exec_q, usm_type); + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if ((is_src_c_contig && is_dst_c_contig) || + (is_src_f_contig && dst_nelems == 1)) + { + // remove_all_extents gets underlying type of table + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + // remove_all_extents gets underlying type of table + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + + using dpctl::tensor::py_internal::simplify_iteration_space; + using dpctl::tensor::py_internal::simplify_iteration_space_1; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT simplified_reduction_shape; + shT simplified_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + simplify_iteration_space_1( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + simplified_reduction_shape, simplified_reduction_src_strides, + 
reduction_src_offset); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + if ((reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + size_t iter_nelems = dst_nelems; + + if (simplified_reduction_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iteration_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast(simplified_iteration_src_strides[0]) == + reduction_nelems); + } + else if (static_cast(simplified_reduction_src_strides[0]) == + iter_nelems) + { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + // remove_all_extents gets underlying type of table + using strided_fn_ptr_T = + typename std::remove_all_extents::type; + strided_fn_ptr_T fn = nullptr; + + if (supports_atomics) { + fn = atomic_dispatch_table[src_typeid][dst_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = temps_dispatch_table[src_typeid][dst_typeid]; + if 
(fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + } + + std::vector host_task_events{}; + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + simplified_reduction_shape, simplified_reduction_src_strides); + py::ssize_t *temp_allocation_ptr = + std::get<0>(arrays_metainfo_packing_triple_); + if (temp_allocation_ptr == nullptr) { + throw std::runtime_error("Unable to allocate memory on device"); + } + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + + py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto reduction_ev = + fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(), + iteration_nd, iter_shape_and_strides, iteration_src_offset, + iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(reduction_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, temp_allocation_ptr] { + sycl::free(temp_allocation_ptr, ctx); + }); + }); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, reduction_ev); +} + +/* ================= No atomic reductions ====================== */ + +/*! 
@brief Template implementing Python API for reduction over axis without + * atomics */ +template +std::pair py_tree_reduction_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &temps_dispatch_table, + const contig_fnT &axis0_temps_dispatch_table, + const contig_fnT &axis1_temps_dispatch_table) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < dst_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if ((is_src_c_contig && is_dst_c_contig) || + (is_src_f_contig && dst_nelems == 1)) + { + auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + 
reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + + using dpctl::tensor::py_internal::simplify_iteration_space; + using dpctl::tensor::py_internal::simplify_iteration_space_1; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT simplified_reduction_shape; + shT simplified_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + simplify_iteration_space_1( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + simplified_reduction_shape, simplified_reduction_src_strides, + reduction_src_offset); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + if ((reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + size_t iter_nelems = dst_nelems; + + if (simplified_reduction_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iteration_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast(simplified_iteration_src_strides[0]) == + reduction_nelems); + } + else if (static_cast(simplified_reduction_src_strides[0]) == + iter_nelems) + { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + 
dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + auto fn = temps_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + std::vector host_task_events{}; + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + simplified_reduction_shape, simplified_reduction_src_strides); + py::ssize_t *temp_allocation_ptr = + std::get<0>(arrays_metainfo_packing_triple_); + if (temp_allocation_ptr == nullptr) { + throw std::runtime_error("Unable to allocate memory on device"); + } + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + + py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto reduction_ev = + fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(), + iteration_nd, iter_shape_and_strides, iteration_src_offset, + iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(reduction_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, temp_allocation_ptr] { + sycl::free(temp_allocation_ptr, ctx); + }); + }); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, reduction_ev); +} + +/*! 
@brief Template implementing Python API for searching over an axis */ +template +std::pair py_search_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &strided_dispatch_table, + const contig_fnT &axis0_contig_dispatch_table, + const contig_fnT &axis1_contig_dispatch_table) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < dst_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && 
dst_nd == 1) { + auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + + using dpctl::tensor::py_internal::simplify_iteration_space; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT compact_reduction_shape; + shT compact_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + compact_iteration_space( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + compact_reduction_shape, compact_reduction_src_strides); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + if ((reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + size_t iter_nelems = dst_nelems; + + if (compact_reduction_src_strides[0] == 1) { + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast(simplified_iteration_src_strides[0]) == + reduction_nelems); + } + else if (static_cast(compact_reduction_src_strides[0]) == + iter_nelems) { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1) { + auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + auto fn = 
axis0_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + auto fn = strided_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + std::vector host_task_events{}; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + const auto &arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + compact_reduction_shape, compact_reduction_src_strides); + py::ssize_t *temp_allocation_ptr = + std::get<0>(arrays_metainfo_packing_triple_); + if (temp_allocation_ptr == nullptr) { + throw std::runtime_error("Unable to allocate memory on device"); + } + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + + py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_nd, iter_shape_and_strides, + iteration_src_offset, iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, temp_allocation_ptr] { + sycl::free(temp_allocation_ptr, ctx); + }); + }); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, comp_ev); +} + +extern void init_reduction_functions(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/sum.cpp b/dpctl/tensor/libtensor/source/reductions/sum.cpp new file mode 100644 index 0000000000..33803cfd7b --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/sum.cpp @@ -0,0 +1,187 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
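The search template above follows the same trailing-axes contract as the reduction templates: the Python layer permutes the axes being reduced to the end of the array and passes their count as trailing_dims_to_reduce. A minimal sketch of the resulting user-visible behavior, assuming the public dpctl.tensor API (values and shapes illustrative):

    import dpctl.tensor as dpt

    x = dpt.reshape(dpt.arange(24, dtype="i4"), (2, 3, 4))
    # reducing over axes (0, 2): the implementation moves them to the
    # trailing positions, so the kernel sees trailing_dims_to_reduce=2
    # and a destination of shape (3,)
    y = dpt.sum(x, axis=(0, 2))
    assert y.shape == (3,)
    assert int(y[0]) == 60  # 0+1+2+3 + 12+13+14+15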
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_sum_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::SumOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(sum_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(sum_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t sum_atomic_support_vector[td_ns::num_types]; + +void populate_sum_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::SumAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(sum_atomic_support_vector); +} + +} // namespace impl + +void init_sum(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_sum_over_axis_dispatch_tables; + populate_sum_over_axis_dispatch_tables(); + using impl::sum_over_axis0_contig_atomic_dispatch_table; + using impl::sum_over_axis0_contig_temps_dispatch_table; + using 
impl::sum_over_axis1_contig_atomic_dispatch_table; + using impl::sum_over_axis1_contig_temps_dispatch_table; + using impl::sum_over_axis_strided_atomic_dispatch_table; + using impl::sum_over_axis_strided_temps_dispatch_table; + + using impl::populate_sum_atomic_support_dispatch_vector; + populate_sum_atomic_support_dispatch_vector(); + using impl::sum_atomic_support_vector; + + auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis0_contig_atomic_dispatch_table, + sum_over_axis1_contig_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_over_axis0_contig_temps_dispatch_table, + sum_over_axis1_contig_temps_dispatch_table, + sum_atomic_support_vector); + }; + m.def("_sum_over_axis", sum_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sum_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_atomic_support_vector); + }; + m.def("_sum_over_axis_dtype_supported", sum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/sum.hpp b/dpctl/tensor/libtensor/source/reductions/sum.hpp new file mode 100644 index 0000000000..ded0d14809 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/sum.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
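sum keeps two strided implementations per type pair: an atomics-based kernel and a temporaries-based fallback, with sum_atomic_support_vector deciding at runtime which one a given queue and USM kind may use. A sketch of how that support query surfaces at the Python level, assuming the private import path below matches the PYBIND11_MODULE(_tensor_reductions_impl) definition later in this patch (illustrative, not public API):

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_reductions_impl as tri  # assumed import path

    q = dpctl.SyclQueue()
    x = dpt.ones((512, 512), dtype="i4", sycl_queue=q)
    # True when either the atomic or the temporaries path can accumulate
    # "i4" inputs into an "i8" device-USM destination on this queue
    ok = tri._sum_over_axis_dtype_supported(x.dtype, dpt.int64, "device", q)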
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_sum(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp index 0dbfb17a5d..f3a20cbbaa 100644 --- a/dpctl/tensor/libtensor/source/repeat.cpp +++ b/dpctl/tensor/libtensor/source/repeat.cpp @@ -136,7 +136,6 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, const py::ssize_t *dst_shape = dst.get_shape_raw(); bool same_orthog_dims(true); size_t orthog_nelems(1); // number of orthogonal iterations - for (auto i = 0; i < axis; ++i) { auto src_sh_i = src_shape[i]; orthog_nelems *= src_sh_i; @@ -237,18 +236,44 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, assert(dst_shape_vec.size() == 1); assert(dst_strides_vec.size() == 1); - py::ssize_t src_shape(0); - py::ssize_t src_stride(0); - if (src_nd > 0) { - src_shape = src_shape_vec[0]; - src_stride = src_strides_vec[0]; + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple1 = + device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shape_strides = + std::get<0>(ptr_size_event_tuple1); + if (packed_src_shape_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); } + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); - sycl::event repeat_ev = + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, src_axis_nelems, src_data_p, dst_data_p, reps_data_p, - cumsum_data_p, src_shape, src_stride, dst_shape_vec[0], - dst_strides_vec[0], reps_shape_vec[0], reps_strides_vec[0], - depends); + cumsum_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], reps_shape_vec[0], + reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(repeat_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shape_strides] { + sycl::free(packed_src_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); } else { // non-empty othogonal directions @@ -343,6 +368,162 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, return std::make_pair(py_obj_management_host_task_ev, repeat_ev); } +std::pair +py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends) +{ + + int dst_nd = dst.get_ndim(); + if (dst_nd != 1) { + throw py::value_error( + "`dst` array must be 1-dimensional when repeating a full array"); + } + + int reps_nd = reps.get_ndim(); + if (reps_nd != 1) { + throw py::value_error("`reps` array must be 1-dimensional"); + } + + if (cumsum.get_ndim() != 1) 
{ + throw py::value_error("`cumsum` array must be 1-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t src_sz = src.get_size(); + size_t reps_sz = reps.get_size(); + size_t cumsum_sz = cumsum.get_size(); + + // shape at repeated axis must be equal to the sum of reps + if (src_sz != reps_sz || src_sz != cumsum_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (src_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accommodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(dst.get_size())) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accommodate all the " + "array elements."); + } + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src, cumsum, or reps + if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int reps_typenum = reps.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexpected data type of `cumsum` array, expecting " + "'int64'"); + } + + if (reps_typeid != cumsum_typeid) { + throw py::value_error("`reps` array must have the same elemental " + "data type as cumsum"); + } + + const char *src_data_p = src.get_data(); + const char *reps_data_p = reps.get_data(); + const char *cumsum_data_p = cumsum.get_data(); + char *dst_data_p = dst.get_data(); + + int src_nd = src.get_ndim(); + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto reps_shape_vec = reps.get_shape_vector(); + auto reps_strides_vec = reps.get_strides_vector(); + + std::vector host_task_events{}; + + auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shapes_strides = std::get<0>(ptr_size_event_tuple1); + if (packed_src_shapes_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + + std::vector 
all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event repeat_ev = fn( + exec_q, src_sz, src_data_p, dst_data_p, reps_data_p, cumsum_data_p, + src_nd, packed_src_shapes_strides, dst_shape_vec[0], dst_strides_vec[0], + reps_shape_vec[0], reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(repeat_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shapes_strides] { + sycl::free(packed_src_shapes_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + host_task_events.push_back(repeat_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, reps, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + std::pair py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, @@ -372,7 +553,6 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, const py::ssize_t *dst_shape = dst.get_shape_raw(); bool same_orthog_dims(true); size_t orthog_nelems(1); // number of orthogonal iterations - for (auto i = 0; i < axis; ++i) { auto src_sh_i = src_shape[i]; orthog_nelems *= src_sh_i; @@ -452,15 +632,42 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, assert(dst_shape_vec.size() == 1); assert(dst_strides_vec.size() == 1); - py::ssize_t src_shape(0); - py::ssize_t src_stride(0); - if (src_nd > 0) { - src_shape = src_shape_vec[0]; - src_stride = src_strides_vec[0]; + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; } - sycl::event repeat_ev = - fn(exec_q, dst_axis_nelems, src_data_p, dst_data_p, reps, src_shape, - src_stride, dst_shape_vec[0], dst_strides_vec[0], depends); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple1 = + device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shape_strides = + std::get<0>(ptr_size_event_tuple1); + if (packed_src_shape_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, dst_axis_nelems, src_data_p, dst_data_p, reps, + src_nd, packed_src_shape_strides, dst_shape_vec[0], + dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(repeat_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shape_strides] { + sycl::free(packed_src_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); } else { // non-empty othogonal directions @@ -554,6 +761,126 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, return std::make_pair(py_obj_management_host_task_ev, repeat_ev); } +std::pair +py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + sycl::queue &exec_q, + const std::vector &depends) +{ + int dst_nd = 
dst.get_ndim(); + if (dst_nd != 1) { + throw py::value_error( + "`dst` array must be 1-dimensional when repeating a full array"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t src_sz = src.get_size(); + size_t dst_sz = dst.get_size(); + + // shape at repeated axis must be equal to the shape of src at the axis * + // reps + if ((src_sz * reps) != dst_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (src_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accommodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(src_sz * reps)) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accommodate all the " + "array elements."); + } + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src + if (overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + const char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + + int src_nd = src.get_ndim(); + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + std::vector host_task_events{}; + + auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shape_strides = std::get<0>(ptr_size_event_tuple1); + if (packed_src_shape_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event repeat_ev = fn(exec_q, dst_sz, src_data_p, dst_data_p, reps, + src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(repeat_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shape_strides] { + sycl::free(packed_src_shape_strides, ctx); + }); + }); + + host_task_events.push_back(cleanup_tmp_allocations_ev); + host_task_events.push_back(repeat_ev); + + sycl::event py_obj_management_host_task_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} 
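Together with the axis-taking overloads earlier in this file, the two overloads added here complete the axis=None case, where the input is treated as a flattened array and the destination is 1-D. A short usage sketch against the public dpt.repeat API (shapes illustrative):

    import dpctl.tensor as dpt

    x = dpt.asarray([[1, 2], [3, 4]])
    r1 = dpt.repeat(x, 2, axis=1)  # per-axis path -> shape (2, 4)
    r2 = dpt.repeat(x, 2)          # axis=None path added here -> shape (8,)
    assert r2.shape == (8,)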
+ } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/repeat.hpp b/dpctl/tensor/libtensor/source/repeat.hpp index 87fb0a0847..65ace36516 100644 --- a/dpctl/tensor/libtensor/source/repeat.hpp +++ b/dpctl/tensor/libtensor/source/repeat.hpp @@ -48,6 +48,14 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, sycl::queue &exec_q, const std::vector &depends); +extern std::pair +py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends); + extern std::pair py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, @@ -56,6 +64,13 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, sycl::queue &exec_q, const std::vector &depends); +extern std::pair +py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + sycl::queue &exec_q, + const std::vector &depends); + } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sum_reductions.cpp b/dpctl/tensor/libtensor/source/sum_reductions.cpp deleted file mode 100644 index 529096f5b6..0000000000 --- a/dpctl/tensor/libtensor/source/sum_reductions.cpp +++ /dev/null @@ -1,542 +0,0 @@ -//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2022 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
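The sum_reductions.cpp file being deleted here bundled its own check_atomic_support() helper; in the new layout that logic lives in reduction_atomic_support.hpp as a per-type dispatch vector (see sum.cpp above). The device aspects it keys on are queryable from Python; a quick check, assuming the standard dpctl device API:

    import dpctl

    dev = dpctl.SyclDevice()
    # fp64 accumulation via atomics additionally requires this aspect
    print(dev.has_aspect_atomic64)
    # shared/host USM destinations require the matching USM-atomic aspects
    print(dev.has_aspect_usm_atomic_shared_allocations)
    print(dev.has_aspect_usm_atomic_host_allocations)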
-// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include - -#include "dpctl4pybind11.hpp" -#include -#include -#include - -#include "kernels/reductions.hpp" -#include "sum_reductions.hpp" - -#include "simplify_iteration_space.hpp" -#include "utils/memory_overlap.hpp" -#include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -bool check_atomic_support(const sycl::queue &exec_q, - sycl::usm::alloc usm_alloc_type, - bool require_atomic64 = false) -{ - bool supports_atomics = false; - - const sycl::device &dev = exec_q.get_device(); - if (require_atomic64) { - if (!dev.has(sycl::aspect::atomic64)) - return false; - } - - switch (usm_alloc_type) { - case sycl::usm::alloc::shared: - supports_atomics = dev.has(sycl::aspect::usm_atomic_shared_allocations); - break; - case sycl::usm::alloc::host: - supports_atomics = dev.has(sycl::aspect::usm_atomic_host_allocations); - break; - case sycl::usm::alloc::device: - supports_atomics = true; - break; - default: - supports_atomics = false; - } - - return supports_atomics; -} - -using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; -static sum_reduction_strided_impl_fn_ptr - sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static sum_reduction_strided_impl_fn_ptr - sum_over_axis_strided_temps_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -using dpctl::tensor::kernels::sum_reduction_contig_impl_fn_ptr; -static sum_reduction_contig_impl_fn_ptr - sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static sum_reduction_contig_impl_fn_ptr - sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -std::pair py_sum_over_axis( - const dpctl::tensor::usm_ndarray &src, - int trailing_dims_to_reduce, // sum over this many trailing indexes - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends) -{ - int src_nd = src.get_ndim(); - int iteration_nd = src_nd - trailing_dims_to_reduce; - if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { - throw py::value_error("Trailing_dim_to_reduce must be positive, but no " - "greater than rank of the array being reduced"); - } - - int dst_nd = dst.get_ndim(); - if (dst_nd != iteration_nd) { - throw py::value_error("Destination array rank does not match input " - "array rank and number of reduced dimensions"); - } - - const py::ssize_t *src_shape_ptr = src.get_shape_raw(); - const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); - - bool same_shapes = true; - for (int i = 0; same_shapes && (i < dst_nd); ++i) { - same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); - } - - if (!same_shapes) { - throw py::value_error("Destination shape does not match unreduced " - "dimensions of the input shape"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } - - size_t dst_nelems = dst.get_size(); - - size_t reduction_nelems(1); - for (int i = dst_nd; i < src_nd; ++i) { - reduction_nelems *= static_cast(src_shape_ptr[i]); - } - - // check that dst and src do not overlap - auto const &overlap = 
dpctl::tensor::overlap::MemoryOverlap(); - if (overlap(src, dst)) { - throw py::value_error("Arrays index overlapping segments of memory"); - } - - // destination must be ample enough to accommodate all elements - { - auto dst_offsets = dst.get_minmax_offsets(); - size_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if (range + 1 < dst_nelems) { - throw py::value_error( - "Destination array can not accommodate all the " - "elements of source array."); - } - } - - int src_typenum = src.get_typenum(); - int dst_typenum = dst.get_typenum(); - - const auto &array_types = td_ns::usm_ndarray_types(); - int src_typeid = array_types.typenum_to_lookup_id(src_typenum); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - int dst_itemsize = dst.get_elemsize(); - bool supports_atomics = false; - - switch (dst_itemsize) { - case sizeof(float): - { - void *data_ptr = dst.get_data(); - const auto &ctx = exec_q.get_context(); - auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - supports_atomics = check_atomic_support(exec_q, usm_type); - } break; - case sizeof(double): - { - void *data_ptr = dst.get_data(); - const auto &ctx = exec_q.get_context(); - auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - - constexpr bool require_atomic64 = true; - supports_atomics = - check_atomic_support(exec_q, usm_type, require_atomic64); - } break; - } - - // handle special case when both reduction and iteration are 1D contiguous - // and can be done with atomics - if (supports_atomics) { - bool is_src_c_contig = src.is_c_contiguous(); - bool is_dst_c_contig = dst.is_c_contiguous(); - bool is_src_f_contig = src.is_f_contiguous(); - - if ((is_src_c_contig && is_dst_c_contig) || - (is_src_f_contig && dst_nelems == 1)) - { - auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - size_t iter_nelems = dst_nelems; - - constexpr py::ssize_t zero_offset = 0; - - sycl::event sum_over_axis_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), - zero_offset, // iteration_src_offset - zero_offset, // iteration_dst_offset - zero_offset, // reduction_src_offset - depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis_contig_ev}); - - return std::make_pair(keep_args_event, sum_over_axis_contig_ev); - } - } - else if (is_src_f_contig && - ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) - { - auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - size_t iter_nelems = dst_nelems; - - constexpr py::ssize_t zero_offset = 0; - - sycl::event sum_over_axis_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), - zero_offset, // iteration_src_offset - zero_offset, // iteration_dst_offset - zero_offset, // reduction_src_offset - depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis_contig_ev}); - - return std::make_pair(keep_args_event, sum_over_axis_contig_ev); - } - } - } - - using dpctl::tensor::py_internal::simplify_iteration_space; - using dpctl::tensor::py_internal::simplify_iteration_space_1; - - auto const &src_shape_vecs = src.get_shape_vector(); - auto const &src_strides_vecs = src.get_strides_vector(); - auto const &dst_strides_vecs = dst.get_strides_vector(); - - int reduction_nd = trailing_dims_to_reduce; - const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; - using shT = 
std::vector; - shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, - std::end(src_strides_vecs)); - - shT simplified_reduction_shape; - shT simplified_reduction_src_strides; - py::ssize_t reduction_src_offset(0); - - simplify_iteration_space_1( - reduction_nd, reduction_shape_ptr, reduction_src_strides, - // output - simplified_reduction_shape, simplified_reduction_src_strides, - reduction_src_offset); - - const py::ssize_t *iteration_shape_ptr = src_shape_ptr; - - shT iteration_src_strides(std::begin(src_strides_vecs), - std::begin(src_strides_vecs) + iteration_nd); - shT const &iteration_dst_strides = dst_strides_vecs; - - shT simplified_iteration_shape; - shT simplified_iteration_src_strides; - shT simplified_iteration_dst_strides; - py::ssize_t iteration_src_offset(0); - py::ssize_t iteration_dst_offset(0); - - if (iteration_nd == 0) { - if (dst_nelems != 1) { - throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); - } - iteration_nd = 1; - simplified_iteration_shape.push_back(1); - simplified_iteration_src_strides.push_back(0); - simplified_iteration_dst_strides.push_back(0); - } - else { - simplify_iteration_space(iteration_nd, iteration_shape_ptr, - iteration_src_strides, iteration_dst_strides, - // output - simplified_iteration_shape, - simplified_iteration_src_strides, - simplified_iteration_dst_strides, - iteration_src_offset, iteration_dst_offset); - } - - if (supports_atomics && (reduction_nd == 1) && (iteration_nd == 1)) { - bool mat_reduce_over_axis1 = false; - bool mat_reduce_over_axis0 = false; - bool array_reduce_all_elems = false; - size_t iter_nelems = dst_nelems; - - if (simplified_reduction_src_strides[0] == 1) { - array_reduce_all_elems = (simplified_iteration_shape[0] == 1); - mat_reduce_over_axis1 = - (simplified_iteration_dst_strides[0] == 1) && - (static_cast(simplified_iteration_src_strides[0]) == - reduction_nelems); - } - else if (static_cast(simplified_reduction_src_strides[0]) == - iter_nelems) - { - mat_reduce_over_axis0 = - (simplified_iteration_dst_strides[0] == 1) && - (simplified_iteration_src_strides[0] == 1); - } - - if (mat_reduce_over_axis1 || array_reduce_all_elems) { - auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - sycl::event sum_over_axis1_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_src_offset, - iteration_dst_offset, reduction_src_offset, depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis1_contig_ev}); - - return std::make_pair(keep_args_event, - sum_over_axis1_contig_ev); - } - } - else if (mat_reduce_over_axis0) { - auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - sycl::event sum_over_axis0_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_src_offset, - iteration_dst_offset, reduction_src_offset, depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis0_contig_ev}); - - return std::make_pair(keep_args_event, - sum_over_axis0_contig_ev); - } - } - } - - using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; - sum_reduction_strided_impl_fn_ptr fn = nullptr; - - if (supports_atomics) { - fn = - sum_over_axis_strided_atomic_dispatch_table[src_typeid][dst_typeid]; - } - - if (fn == nullptr) { - // use slower reduction implementation using temporaries - fn = 
sum_over_axis_strided_temps_dispatch_table[src_typeid][dst_typeid]; - if (fn == nullptr) { - throw std::runtime_error("Datatypes are not supported"); - } - } - - std::vector host_task_events{}; - - using dpctl::tensor::offset_utils::device_allocate_and_pack; - - const auto &arrays_metainfo_packing_triple_ = - device_allocate_and_pack( - exec_q, host_task_events, - // iteration metadata - simplified_iteration_shape, simplified_iteration_src_strides, - simplified_iteration_dst_strides, - // reduction metadata - simplified_reduction_shape, simplified_reduction_src_strides); - py::ssize_t *temp_allocation_ptr = - std::get<0>(arrays_metainfo_packing_triple_); - if (temp_allocation_ptr == nullptr) { - throw std::runtime_error("Unable to allocate memory on device"); - } - const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); - - py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; - py::ssize_t *reduction_shape_stride = - temp_allocation_ptr + 3 * simplified_iteration_shape.size(); - - std::vector all_deps; - all_deps.reserve(depends.size() + 1); - all_deps.resize(depends.size()); - std::copy(depends.begin(), depends.end(), all_deps.begin()); - all_deps.push_back(copy_metadata_ev); - - auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_nd, iter_shape_and_strides, - iteration_src_offset, iteration_dst_offset, - reduction_nd, // number dimensions being reduced - reduction_shape_stride, reduction_src_offset, all_deps); - - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - const auto &ctx = exec_q.get_context(); - cgh.host_task([ctx, temp_allocation_ptr] { - sycl::free(temp_allocation_ptr, ctx); - }); - }); - host_task_events.push_back(temp_cleanup_ev); - - sycl::event keep_args_event = - dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); - - return std::make_pair(keep_args_event, comp_ev); -} - -bool py_sum_over_axis_dtype_supported(const py::dtype &input_dtype, - const py::dtype &output_dtype, - const std::string &dst_usm_type, - sycl::queue &q) -{ - int arg_tn = - input_dtype.num(); // NumPy type numbers are the same as in dpctl - int out_tn = - output_dtype.num(); // NumPy type numbers are the same as in dpctl - int arg_typeid = -1; - int out_typeid = -1; - - auto array_types = td_ns::usm_ndarray_types(); - - try { - arg_typeid = array_types.typenum_to_lookup_id(arg_tn); - out_typeid = array_types.typenum_to_lookup_id(out_tn); - } catch (const std::exception &e) { - throw py::value_error(e.what()); - } - - if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || - out_typeid >= td_ns::num_types) - { - throw std::runtime_error("Reduction type support check: lookup failed"); - } - - using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; - sum_reduction_strided_impl_fn_ptr fn = nullptr; - - sycl::usm::alloc kind = sycl::usm::alloc::unknown; - - if (dst_usm_type == "device") { - kind = sycl::usm::alloc::device; - } - else if (dst_usm_type == "shared") { - kind = sycl::usm::alloc::shared; - } - else if (dst_usm_type == "host") { - kind = sycl::usm::alloc::host; - } - else { - throw py::value_error("Unrecognized `dst_usm_type` argument."); - } - - bool supports_atomics = false; - - switch (output_dtype.itemsize()) { - case sizeof(float): - { - supports_atomics = check_atomic_support(q, kind); - } break; - case sizeof(double): - { - constexpr bool require_atomic64 = true; - supports_atomics = check_atomic_support(q, kind, 
require_atomic64); - } break; - } - - if (supports_atomics) { - fn = - sum_over_axis_strided_atomic_dispatch_table[arg_typeid][out_typeid]; - } - - if (fn == nullptr) { - // use slower reduction implementation using temporaries - fn = sum_over_axis_strided_temps_dispatch_table[arg_typeid][out_typeid]; - } - - return (fn != nullptr); -} - -void populate_sum_over_axis_dispatch_table(void) -{ - using dpctl::tensor::kernels::sum_reduction_contig_impl_fn_ptr; - using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; - using namespace td_ns; - - using dpctl::tensor::kernels::SumOverAxisAtomicStridedFactory; - DispatchTableBuilder - dtb1; - dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table); - - using dpctl::tensor::kernels::SumOverAxisTempsStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); - - using dpctl::tensor::kernels::SumOverAxis1AtomicContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); - - using dpctl::tensor::kernels::SumOverAxis0AtomicContigFactory; - DispatchTableBuilder - dtb4; - dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); -} - -namespace py = pybind11; - -void init_reduction_functions(py::module_ m) -{ - populate_sum_over_axis_dispatch_table(); - - m.def("_sum_over_axis", &py_sum_over_axis, "", py::arg("src"), - py::arg("trailing_dims_to_reduce"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - m.def("_sum_over_axis_dtype_supported", &py_sum_over_axis_dtype_supported, - "", py::arg("arg_dtype"), py::arg("out_dtype"), - py::arg("dst_usm_type"), py::arg("sycl_queue")); -} - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp similarity index 87% rename from dpctl/tensor/libtensor/source/tensor_py.cpp rename to dpctl/tensor/libtensor/source/tensor_ctors.cpp index 2ce7c72add..4720f6baa1 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_ctors.cpp @@ -1,4 +1,5 @@ -//===-- tensor_py.cpp - Implementation of _tensor_impl module --*-C++-*-/===// +//===-- tensor_ctors.cpp - ---*-C++-*-/===// +// Implementation of _tensor_impl module // // Data Parallel Control (dpctl) // @@ -30,25 +31,24 @@ #include #include #include +#include #include "dpctl4pybind11.hpp" #include "accumulators.hpp" #include "boolean_advanced_indexing.hpp" -#include "boolean_reductions.hpp" +#include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" #include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" -#include "elementwise_functions.hpp" #include "eye_ctor.hpp" #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "linear_sequences.hpp" #include "repeat.hpp" #include "simplify_iteration_space.hpp" -#include "sum_reductions.hpp" #include "triul_ctor.hpp" #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" @@ -115,6 +115,9 @@ using dpctl::tensor::py_internal::usm_ndarray_triul; using dpctl::tensor::py_internal::py_where; +/* =========================== Clip ============================== */ +using dpctl::tensor::py_internal::py_clip; + // populate dispatch tables void init_dispatch_tables(void) { @@ -147,6 +150,8 @@ void init_dispatch_vectors(void) populate_cumsum_1d_dispatch_vectors(); 
     init_repeat_dispatch_vectors();
 
+    init_clip_dispatch_vectors();
+
     return;
 }
 
@@ -402,15 +407,49 @@ PYBIND11_MODULE(_tensor_impl, m)
           py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"),
           py::arg("depends") = py::list());
 
-    m.def("_repeat_by_sequence", &py_repeat_by_sequence, "", py::arg("src"),
+    auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src,
+                              const dpctl::tensor::usm_ndarray &dst,
+                              const dpctl::tensor::usm_ndarray &reps,
+                              const dpctl::tensor::usm_ndarray &cumsum,
+                              std::optional<py::ssize_t> axis,
+                              sycl::queue &exec_q,
+                              const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        if (axis) {
+            return py_repeat_by_sequence(src, dst, reps, cumsum, axis.value(),
+                                         exec_q, depends);
+        }
+        else {
+            return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q,
+                                         depends);
+        }
+    };
+    m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"),
           py::arg("dst"), py::arg("reps"), py::arg("cumsum"), py::arg("axis"),
           py::arg("sycl_queue"), py::arg("depends") = py::list());
 
-    m.def("_repeat_by_scalar", &py_repeat_by_scalar, "", py::arg("src"),
-          py::arg("dst"), py::arg("reps"), py::arg("axis"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
+    auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src,
+                            const dpctl::tensor::usm_ndarray &dst,
+                            const py::ssize_t reps,
+                            std::optional<py::ssize_t> axis,
+                            sycl::queue &exec_q,
+                            const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        if (axis) {
+            return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q,
+                                       depends);
+        }
+        else {
+            return py_repeat_by_scalar(src, dst, reps, exec_q, depends);
+        }
+    };
+    m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"),
+          py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
 
-    dpctl::tensor::py_internal::init_elementwise_functions(m);
-    dpctl::tensor::py_internal::init_boolean_reduction_functions(m);
-    dpctl::tensor::py_internal::init_reduction_functions(m);
+    m.def("_clip", &py_clip,
+          "Clamps elements of array `x` to the range "
+          "[`min`, `max`] and writes the result to the "
+          "array `dst` for each element of `x`, `min`, and `max`. "
+          "Returns a tuple of events: (hev, ev)",
+          py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
 }
diff --git a/dpctl/tensor/libtensor/source/tensor_elementwise.cpp b/dpctl/tensor/libtensor/source/tensor_elementwise.cpp
new file mode 100644
index 0000000000..1a86526893
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/tensor_elementwise.cpp
@@ -0,0 +1,34 @@
+//===-- tensor_elementwise.cpp ---*-C++-*-/===//
+// Implementation of _tensor_elementwise_impl module
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
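The _clip binding registered above backs the new dpt.clip function. A minimal usage sketch (scalar bounds shown; min and max broadcast against x):

    import dpctl.tensor as dpt

    x = dpt.arange(10, dtype="f4")
    y = dpt.clip(x, 2, 7)  # elementwise clamp of x to the range [2, 7]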
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include "elementwise_functions/elementwise_common.hpp" +#include + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_elementwise_impl, m) +{ + dpctl::tensor::py_internal::init_elementwise_functions(m); +} diff --git a/dpctl/tensor/libtensor/source/tensor_reductions.cpp b/dpctl/tensor/libtensor/source/tensor_reductions.cpp new file mode 100644 index 0000000000..138c31f3eb --- /dev/null +++ b/dpctl/tensor/libtensor/source/tensor_reductions.cpp @@ -0,0 +1,37 @@ +//===-- tensor_reductions.cpp - --*-C++-*-/===// +// Implementation of _tensor_reductions_impl module +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include + +#include "boolean_reductions.hpp" +#include "reductions/reduction_common.hpp" + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_reductions_impl, m) +{ + dpctl::tensor::py_internal::init_boolean_reduction_functions(m); + dpctl::tensor::py_internal::init_reduction_functions(m); +} diff --git a/dpctl/tests/elementwise/test_bitwise_and.py b/dpctl/tests/elementwise/test_bitwise_and.py index b3a5bd665b..824e319709 100644 --- a/dpctl/tests/elementwise/test_bitwise_and.py +++ b/dpctl/tests/elementwise/test_bitwise_and.py @@ -18,6 +18,7 @@ import pytest import dpctl.tensor as dpt +from dpctl.tensor._type_utils import _can_cast from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported from .utils import _integral_dtypes @@ -85,3 +86,58 @@ def test_bitwise_and_bool(): r_lo = dpt.logical_and(x1[:, dpt.newaxis], x2[dpt.newaxis]) assert dpt.all(dpt.equal(r_bw, r_lo)) + + +@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes) +def test_bitwise_and_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind == "b": + X &= False + else: + X &= int(0) + + +@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes) +def test_bitwise_and_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64): + ar1 &= ar2 + assert 
dpt.all(ar1 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 &= ar4
+        assert dpt.all(ar3 == 1)
+    else:
+        with pytest.raises(TypeError):
+            ar1 &= ar2
+            dpt.bitwise_and(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_and(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_and(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 1)
+    else:
+        with pytest.raises(TypeError):
+            dpt.bitwise_and(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_bitwise_left_shift.py b/dpctl/tests/elementwise/test_bitwise_left_shift.py
index cee1019353..06684ac13b 100644
--- a/dpctl/tests/elementwise/test_bitwise_left_shift.py
+++ b/dpctl/tests/elementwise/test_bitwise_left_shift.py
@@ -18,6 +18,7 @@ import pytest
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 from .utils import _integral_dtypes
@@ -97,3 +98,54 @@ def test_bitwise_left_shift_range(op_dtype):
     z = dpt.bitwise_left_shift(x, y)
     assert dpt.all(dpt.equal(z, 0))
+
+
+@pytest.mark.parametrize("dtype", _integral_dtypes)
+def test_bitwise_left_shift_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    X <<= int(0)
+
+
+@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
+def test_bitwise_left_shift_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+        ar1 <<= ar2
+        assert dpt.all(ar1 == 2)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 <<= ar4
+        assert dpt.all(ar3 == 2)
+    else:
+        with pytest.raises(TypeError):
+            ar1 <<= ar2
+            dpt.bitwise_left_shift(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_left_shift(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 2)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_left_shift(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 2)
+    else:
+        with pytest.raises(TypeError):
+            dpt.bitwise_left_shift(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_bitwise_or.py b/dpctl/tests/elementwise/test_bitwise_or.py
index d273bd1507..49949cb795 100644
--- a/dpctl/tests/elementwise/test_bitwise_or.py
+++ b/dpctl/tests/elementwise/test_bitwise_or.py
@@ -18,6 +18,7 @@ import pytest
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 from .utils import _integral_dtypes
@@ -85,3 +86,58 @@ def test_bitwise_or_bool():
     r_lo =
dpt.logical_or(x1[:, dpt.newaxis], x2[dpt.newaxis]) assert dpt.all(dpt.equal(r_bw, r_lo)) + + +@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes) +def test_bitwise_or_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind == "b": + X |= False + else: + X |= int(0) + + +@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes) +def test_bitwise_or_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64): + ar1 |= ar2 + assert dpt.all(ar1 == 1) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 |= ar4 + assert dpt.all(ar3 == 1) + else: + with pytest.raises(TypeError): + ar1 |= ar2 + dpt.bitwise_or(ar1, ar2, out=ar1) + + # out is second arg + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64): + dpt.bitwise_or(ar1, ar2, out=ar2) + assert dpt.all(ar2 == 1) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + dpt.bitwise_or(ar3, ar4, out=ar4) + dpt.all(ar4 == 1) + else: + with pytest.raises(TypeError): + dpt.bitwise_or(ar1, ar2, out=ar2) diff --git a/dpctl/tests/elementwise/test_bitwise_right_shift.py b/dpctl/tests/elementwise/test_bitwise_right_shift.py index ceadb9414d..37112133db 100644 --- a/dpctl/tests/elementwise/test_bitwise_right_shift.py +++ b/dpctl/tests/elementwise/test_bitwise_right_shift.py @@ -18,6 +18,7 @@ import pytest import dpctl.tensor as dpt +from dpctl.tensor._type_utils import _can_cast from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported from .utils import _integral_dtypes @@ -97,3 +98,54 @@ def test_bitwise_right_shift_range(op_dtype): z = dpt.bitwise_right_shift(x, y) assert dpt.all(dpt.equal(z, 0)) + + +@pytest.mark.parametrize("dtype", _integral_dtypes) +def test_bitwise_right_shift_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + X >>= int(0) + + +@pytest.mark.parametrize("op1_dtype", _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", _integral_dtypes) +def test_bitwise_right_shift_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64): + ar1 >>= ar2 + assert dpt.all(ar1 == 0) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 >>= ar4 + assert dpt.all(ar3 == 0) + else: + with pytest.raises(TypeError): + ar1 >>= ar2 + dpt.bitwise_right_shift(ar1, ar2, out=ar1) + + # out is second arg + ar1 = 
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_right_shift(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_right_shift(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 0)
+    else:
+        with pytest.raises(TypeError):
+            dpt.bitwise_right_shift(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_bitwise_xor.py b/dpctl/tests/elementwise/test_bitwise_xor.py
index b2cb11bc84..e9501b642f 100644
--- a/dpctl/tests/elementwise/test_bitwise_xor.py
+++ b/dpctl/tests/elementwise/test_bitwise_xor.py
@@ -18,6 +18,7 @@
 import pytest
 
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
 from .utils import _integral_dtypes
@@ -85,3 +86,58 @@ def test_bitwise_xor_bool():
     r_lo = dpt.logical_xor(x1[:, dpt.newaxis], x2[dpt.newaxis])
     assert dpt.all(dpt.equal(r_bw, r_lo))
+
+
+@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes)
+def test_bitwise_xor_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind == "b":
+        X ^= False
+    else:
+        X ^= int(0)
+
+
+@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes)
+def test_bitwise_xor_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+        ar1 ^= ar2
+        assert dpt.all(ar1 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 ^= ar4
+        assert dpt.all(ar3 == 0)
+    else:
+        with pytest.raises(TypeError):
+            ar1 ^= ar2
+            dpt.bitwise_xor(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_xor(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_xor(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 0)
+    else:
+        with pytest.raises(TypeError):
+            dpt.bitwise_xor(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_cbrt.py b/dpctl/tests/elementwise/test_cbrt.py
new file mode 100644
index 0000000000..b06a8d19cf
--- /dev/null
+++ b/dpctl/tests/elementwise/test_cbrt.py
@@ -0,0 +1,79 @@
+# Data Parallel Control (dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _map_to_device_dtype, _no_complex_dtypes, _real_fp_dtypes + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_cbrt_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.cbrt(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.cbrt(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_cbrt_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.cbrt(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_cbrt_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2054 + + X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.cbrt(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.usefixtures("suppress_invalid_numpy_warnings") +def test_cbrt_special_cases(): + get_queue_or_skip() + + X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + res = dpt.cbrt(X) + expected = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + tol = dpt.finfo(dpt.float32).resolution + + assert dpt.allclose(res, expected, atol=tol, rtol=tol, equal_nan=True) diff --git a/dpctl/tests/elementwise/test_copysign.py b/dpctl/tests/elementwise/test_copysign.py new file mode 100644 index 0000000000..26a285343c --- /dev/null +++ b/dpctl/tests/elementwise/test_copysign.py @@ -0,0 +1,111 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
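+
+# Note: copysign(x1, x2) composes the magnitude of x1 with the sign of x2.
+# The sign bit is transferred directly, so the operation is well defined for
+# signed zeros and NaNs; that is why the special-value checks below inspect
+# dpt.signbit rather than comparing values, e.g. copysign(1.0, -0.0) == -1.0
+# and copysign(nan, -1.0) is a NaN whose sign bit is set.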
+ +import ctypes + +import numpy as np +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _compare_dtypes, _no_complex_dtypes, _real_fp_dtypes + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes) +def test_copysign_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.copysign(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.copysign( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.copysign(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.copysign( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _real_fp_dtypes) +def test_copysign_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.copysign(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.copysign(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +@pytest.mark.parametrize("dt", _real_fp_dtypes) +def test_copysign(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x = dpt.arange(100, dtype=dt, sycl_queue=q) + x[1::2] *= -1 + y = dpt.ones(100, dtype=dt, sycl_queue=q) + y[::2] *= -1 + res = dpt.copysign(x, y) + expected = dpt.negative(x) + tol = dpt.finfo(dt).resolution + assert dpt.allclose(res, expected, atol=tol, rtol=tol) + + +def test_copysign_special_values(): + get_queue_or_skip() + + x1 = dpt.asarray([1.0, 0.0, dpt.nan, dpt.nan], dtype="f4") + y1 = dpt.asarray([-1.0, -0.0, -dpt.nan, -1], dtype="f4") + res = dpt.copysign(x1, y1) + assert dpt.all(dpt.signbit(res)) + x2 = dpt.asarray([-1.0, -0.0, -dpt.nan, -dpt.nan], dtype="f4") + res = dpt.copysign(x2, y1) + assert dpt.all(dpt.signbit(res)) + y2 = dpt.asarray([0.0, 1.0, dpt.nan, 1.0], dtype="f4") + res = dpt.copysign(x2, y2) + assert not dpt.any(dpt.signbit(res)) + res = dpt.copysign(x1, y2) + assert not dpt.any(dpt.signbit(res)) diff --git a/dpctl/tests/elementwise/test_divide.py b/dpctl/tests/elementwise/test_divide.py index 41aac736d7..a54060792c 100644 --- a/dpctl/tests/elementwise/test_divide.py +++ b/dpctl/tests/elementwise/test_divide.py @@ -21,9 +21,16 @@ import dpctl import dpctl.tensor as dpt +from dpctl.tensor._type_utils import _can_cast from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported -from .utils import _all_dtypes, _compare_dtypes, _usm_types +from .utils import ( + _all_dtypes, + _compare_dtypes, + _complex_fp_dtypes, + _real_fp_dtypes, + _usm_types, +) @pytest.mark.parametrize("op1_dtype", _all_dtypes) @@ -187,3 +194,65 @@ def __sycl_usm_array_interface__(self): c = Canary() with 
pytest.raises(ValueError):
         dpt.divide(a, c)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes + _complex_fp_dtypes)
+def test_divide_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind == "f":
+        X /= float(1)
+    elif dt_kind == "c":
+        X /= complex(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_divide_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    # out array only valid if it is inexact
+    if (
+        _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64)
+        and dpt.dtype(op1_dtype).kind in "fc"
+    ):
+        ar1 /= ar2
+        assert dpt.all(ar1 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 /= ar4
+        assert dpt.all(ar3 == 1)
+    else:
+        with pytest.raises(TypeError):
+            ar1 /= ar2
+            dpt.divide(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if (
+        _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64)
+        and dpt.dtype(op2_dtype).kind in "fc"
+    ):
+        dpt.divide(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.divide(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 1)
+    else:
+        with pytest.raises(TypeError):
+            dpt.divide(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_elementwise_classes.py b/dpctl/tests/elementwise/test_elementwise_classes.py
new file mode 100644
index 0000000000..b7f1d26d6e
--- /dev/null
+++ b/dpctl/tests/elementwise/test_elementwise_classes.py
@@ -0,0 +1,80 @@
+# Data Parallel Control (dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
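+
+# Note: dpt.negative and dpt.divide are used here as representative instances
+# of the elementwise unary and binary function classes. The getters exercised
+# below return the low-level implementation and type-resolution callables,
+# and the ``types`` property lists the supported type signatures as strings
+# containing "->" (input dtypes mapped to the result dtype).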
+ +import dpctl.tensor as dpt + +unary_fn = dpt.negative +binary_fn = dpt.divide + + +def test_unary_class_getters(): + fn = unary_fn.get_implementation_function() + assert callable(fn) + + fn = unary_fn.get_type_result_resolver_function() + assert callable(fn) + + +def test_unary_class_types_property(): + loop_types = unary_fn.types + assert isinstance(loop_types, list) + assert len(loop_types) > 0 + assert all(isinstance(sig, str) for sig in loop_types) + assert all("->" in sig for sig in loop_types) + + +def test_unary_class_str_repr(): + s = str(unary_fn) + r = repr(unary_fn) + + assert isinstance(s, str) + assert isinstance(r, str) + kl_n = unary_fn.__name__ + assert kl_n in s + assert kl_n in r + + +def test_binary_class_getters(): + fn = binary_fn.get_implementation_function() + assert callable(fn) + + fn = binary_fn.get_implementation_inplace_function() + assert callable(fn) + + fn = binary_fn.get_type_result_resolver_function() + assert callable(fn) + + fn = binary_fn.get_type_promotion_path_acceptance_function() + assert callable(fn) + + +def test_binary_class_types_property(): + loop_types = binary_fn.types + assert isinstance(loop_types, list) + assert len(loop_types) > 0 + assert all(isinstance(sig, str) for sig in loop_types) + assert all("->" in sig for sig in loop_types) + + +def test_binary_class_str_repr(): + s = str(binary_fn) + r = repr(binary_fn) + + assert isinstance(s, str) + assert isinstance(r, str) + kl_n = binary_fn.__name__ + assert kl_n in s + assert kl_n in r diff --git a/dpctl/tests/elementwise/test_exp2.py b/dpctl/tests/elementwise/test_exp2.py new file mode 100644 index 0000000000..d4bef1efab --- /dev/null +++ b/dpctl/tests/elementwise/test_exp2.py @@ -0,0 +1,168 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
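+
+# Note: exp2(x) computes 2**x, i.e. exp(x * ln 2), so the real special values
+# below follow directly from exp: exp2(nan) -> nan, exp2(+-0) -> 1,
+# exp2(inf) -> inf and exp2(-inf) -> 0. The complex expectations mirror the
+# C99 special-value conventions for cexp applied to the scaled argument.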
+ +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _map_to_device_dtype, _usm_types + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_exp2_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.exp2(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.exp2(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_exp2_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.exp2(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_exp2_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2 * 1027 + + X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.exp2(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_exp2_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 / 4 + X[..., 1::2] = 1 / 2 + + Y = dpt.exp2(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.exp2(np.float32(1 / 4)) + expected_Y[..., 1::2] = np.exp2(np.float32(1 / 2)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_exp2_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 / 4 + X[..., 1::2] = 1 / 2 + + for ord in ["C", "F", "A", "K"]: + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + Y = dpt.exp2(U, order=ord) + expected_Y = np.exp2(dpt.asnumpy(U)) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +def test_exp2_special_cases(): + get_queue_or_skip() + + X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + res = np.asarray([np.nan, 1.0, 1.0, np.inf, 0.0], dtype="f4") + + tol = dpt.finfo(X.dtype).resolution + assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol) + + # special cases for complex variant + num_finite = 1.0 + vals = [ + complex(0.0, 0.0), + complex(num_finite, dpt.inf), + complex(num_finite, dpt.nan), + complex(dpt.inf, 0.0), + complex(-dpt.inf, num_finite), + complex(dpt.inf, num_finite), + complex(-dpt.inf, dpt.inf), + complex(dpt.inf, dpt.inf), + complex(-dpt.inf, dpt.nan), + complex(dpt.inf, dpt.nan), + complex(dpt.nan, 0.0), + 
complex(dpt.nan, num_finite), + complex(dpt.nan, dpt.nan), + ] + X = dpt.asarray(vals, dtype=dpt.complex64) + cis_1 = complex(np.cos(num_finite), np.sin(num_finite)) + c_nan = complex(np.nan, np.nan) + res = np.asarray( + [ + complex(1.0, 0.0), + c_nan, + c_nan, + complex(np.inf, 0.0), + 0.0, + np.inf * cis_1, + complex(0.0, 0.0), + complex(np.inf, np.nan), + complex(0.0, 0.0), + complex(np.inf, np.nan), + complex(np.nan, 0.0), + c_nan, + c_nan, + ], + dtype=np.complex64, + ) + + tol = dpt.finfo(X.dtype).resolution + with np.errstate(invalid="ignore"): + assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol) diff --git a/dpctl/tests/elementwise/test_floor_divide.py b/dpctl/tests/elementwise/test_floor_divide.py index c8ba5e80f1..b57c006cdf 100644 --- a/dpctl/tests/elementwise/test_floor_divide.py +++ b/dpctl/tests/elementwise/test_floor_divide.py @@ -21,13 +21,19 @@ import dpctl import dpctl.tensor as dpt +from dpctl.tensor._type_utils import _can_cast from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported -from .utils import _compare_dtypes, _no_complex_dtypes, _usm_types +from .utils import ( + _compare_dtypes, + _integral_dtypes, + _no_complex_dtypes, + _usm_types, +) -@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes) -@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes) +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) def test_floor_divide_dtype_matrix(op1_dtype, op2_dtype): q = get_queue_or_skip() skip_if_dtype_not_supported(op1_dtype, q) @@ -133,7 +139,7 @@ def test_floor_divide_broadcasting(): assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() -@pytest.mark.parametrize("arr_dt", _no_complex_dtypes) +@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:]) def test_floor_divide_python_scalar(arr_dt): q = get_queue_or_skip() skip_if_dtype_not_supported(arr_dt, q) @@ -204,7 +210,7 @@ def test_floor_divide_gh_1247(): ) -@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:9]) +@pytest.mark.parametrize("dtype", _integral_dtypes) def test_floor_divide_integer_zero(dtype): q = get_queue_or_skip() skip_if_dtype_not_supported(dtype, q) @@ -255,3 +261,59 @@ def test_floor_divide_special_cases(): res = dpt.floor_divide(x, y) res_np = np.floor_divide(dpt.asnumpy(x), dpt.asnumpy(y)) np.testing.assert_array_equal(dpt.asnumpy(res), res_np) + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:]) +def test_divide_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind in "ui": + X //= int(1) + elif dt_kind == "f": + X //= float(1) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) +def test_floor_divide_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + # out array only valid if it is inexact + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64): + ar1 //= ar2 + assert dpt.all(ar1 == 1) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 //= ar4 + assert dpt.all(ar3 == 
1)
+    else:
+        with pytest.raises(TypeError):
+            ar1 //= ar2
+            dpt.floor_divide(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.floor_divide(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.floor_divide(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 1)
+    else:
+        with pytest.raises(TypeError):
+            dpt.floor_divide(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_pow.py b/dpctl/tests/elementwise/test_pow.py
index 1f13e2b533..8b76e3a9fc 100644
--- a/dpctl/tests/elementwise/test_pow.py
+++ b/dpctl/tests/elementwise/test_pow.py
@@ -21,6 +21,7 @@
 import dpctl
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
 from .utils import _all_dtypes, _compare_dtypes, _usm_types
@@ -152,3 +153,60 @@ def test_pow_python_scalar(arr_dt):
         assert isinstance(R, dpt.usm_ndarray)
         R = dpt.pow(sc, X)
         assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_pow_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X **= int(1)
+    elif dt_kind == "f":
+        X **= float(1)
+    elif dt_kind == "c":
+        X **= complex(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:])
+def test_pow_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+        ar1 **= ar2
+        assert (
+            dpt.asnumpy(ar1) == np.full(ar1.shape, 1, dtype=ar1.dtype)
+        ).all()
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+        ar3[::-1] **= ar4[::2]
+        assert (
+            dpt.asnumpy(ar3) == np.full(ar3.shape, 1, dtype=ar3.dtype)
+        ).all()
+
+    else:
+        with pytest.raises(TypeError):
+            ar1 **= ar2
+
+
+def test_pow_inplace_basic():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    expected = dpt.square(x)
+    x **= 2
+
+    assert dpt.all(x == expected)
diff --git a/dpctl/tests/elementwise/test_remainder.py b/dpctl/tests/elementwise/test_remainder.py
index def594f269..47500954a2 100644
--- a/dpctl/tests/elementwise/test_remainder.py
+++ b/dpctl/tests/elementwise/test_remainder.py
@@ -21,6 +21,7 @@
 import dpctl
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
 from .utils import _compare_dtypes, _no_complex_dtypes, _usm_types
@@ -206,3 +207,54 @@ def test_remainder_python_scalar(arr_dt):
         assert isinstance(R, dpt.usm_ndarray)
         R = dpt.remainder(sc, X)
         assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:])
+def test_remainder_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X %= int(1)
+    elif dt_kind ==
"f": + X %= float(1) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) +def test_remainder_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64): + ar1 %= ar2 + assert dpt.all(ar1 == dpt.zeros(ar1.shape, dtype=ar1.dtype)) + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + ar3[::-1] %= ar4[::2] + assert dpt.all(ar3 == dpt.zeros(ar3.shape, dtype=ar3.dtype)) + + else: + with pytest.raises(TypeError): + ar1 %= ar2 + + +def test_remainder_inplace_basic(): + get_queue_or_skip() + + x = dpt.arange(10, dtype="i4") + expected = x & 1 + x %= 2 + + assert dpt.all(x == expected) diff --git a/dpctl/tests/elementwise/test_rsqrt.py b/dpctl/tests/elementwise/test_rsqrt.py new file mode 100644 index 0000000000..ef9378ade2 --- /dev/null +++ b/dpctl/tests/elementwise/test_rsqrt.py @@ -0,0 +1,74 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _map_to_device_dtype, _no_complex_dtypes, _real_fp_dtypes + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_rsqrt_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.asarray(1, dtype=dtype, sycl_queue=q) + expected_dtype = np.reciprocal(np.sqrt(np.array(1, dtype=dtype))).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.rsqrt(x).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_rsqrt_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q) + res = dpt.rsqrt(x) + expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype)) + tol = 8 * dpt.finfo(res.dtype).resolution + assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_rsqrt_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2054 + + x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + res = dpt.rsqrt(x) + expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype)) + tol = 8 * dpt.finfo(res.dtype).resolution + assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol) + + +def test_rsqrt_special_cases(): + get_queue_or_skip() + + x = dpt.asarray([dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + res = dpt.rsqrt(x) + expected = dpt.asarray( + [dpt.nan, dpt.nan, dpt.inf, -dpt.inf, 0.0, dpt.nan], dtype="f4" + ) + assert dpt.allclose(res, expected, equal_nan=True) diff --git a/dpctl/tests/test_sycl_event.py b/dpctl/tests/test_sycl_event.py index fa496d1bb8..7f0db07539 100644 --- a/dpctl/tests/test_sycl_event.py +++ b/dpctl/tests/test_sycl_event.py @@ -202,7 +202,12 @@ def test_sycl_timer(): m1.copy_from_device(m2) # host operation [x**2 for x in range(128 * 1024)] - host_dt, device_dt = timer.dt + elapsed = timer.dt + host_dt, device_dt = elapsed + assert isinstance(repr(elapsed), str) + assert isinstance(str(elapsed), str) + assert host_dt == elapsed.host_dt + assert device_dt == elapsed.device_dt assert host_dt > device_dt or (host_dt > 0 and device_dt >= 0) q_no_profiling = dpctl.SyclQueue() assert q_no_profiling.has_enable_profiling is False diff --git a/dpctl/tests/test_sycl_kernel_submit.py b/dpctl/tests/test_sycl_kernel_submit.py index d15f5c8e2b..697af32f5c 100644 --- a/dpctl/tests/test_sycl_kernel_submit.py +++ b/dpctl/tests/test_sycl_kernel_submit.py @@ -114,7 +114,7 @@ def test_create_program_from_source(ctype_str, dtype, ctypes_ctor): ) -def test_async_submit(): +def test_submit_async(): try: q = dpctl.SyclQueue("opencl") except dpctl.SyclQueueCreationError: @@ -182,7 +182,7 @@ def test_async_submit(): async_detected = False for attempt in range(5): - e1 = q.submit( + e1 = q.submit_async( kern1Kernel, [ first_row, @@ -192,7 +192,7 @@ def test_async_submit(): n, ], ) - e2 = q.submit( + e2 = q.submit_async( kern2Kernel, [ second_row, @@ -202,7 +202,7 @@ def test_async_submit(): n, ], ) - e3 = q.submit( + e3 = q.submit_async( kern3Kernel, [third_row, first_row, second_row], [ @@ -214,6 +214,9 @@ def test_async_submit(): e3_st = e3.execution_status e2_st = e2.execution_status e1_st = e1.execution_status + ht_e = 
q._submit_keep_args_alive( + [first_row, second_row, third_row], [e1, e2, e3] + ) are_complete = [ e == status_complete for e in ( @@ -223,6 +226,7 @@ def test_async_submit(): ) ] e3.wait() + ht_e.wait() if not all(are_complete): async_detected = True break diff --git a/dpctl/tests/test_sycl_queue_memcpy.py b/dpctl/tests/test_sycl_queue_memcpy.py index 45c8e41f61..e678b73f03 100644 --- a/dpctl/tests/test_sycl_queue_memcpy.py +++ b/dpctl/tests/test_sycl_queue_memcpy.py @@ -44,7 +44,77 @@ def test_memcpy_copy_usm_to_usm(): q.memcpy(mobj2, mobj1, 3) - assert mv2[:3], b"123" + assert mv2[:3] == b"123" + + +def test_memcpy_copy_host_to_usm(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + usm_obj = _create_memory(q) + + canary = bytearray(b"123456789") + host_obj = memoryview(canary) + + q.memcpy(usm_obj, host_obj, len(canary)) + + mv2 = memoryview(usm_obj) + + assert mv2[: len(canary)] == canary + + +def test_memcpy_copy_usm_to_host(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + usm_obj = _create_memory(q) + mv2 = memoryview(usm_obj) + + n = 9 + for id in range(n): + mv2[id] = ord("a") + id + + host_obj = bytearray(b" " * n) + + q.memcpy(host_obj, usm_obj, n) + + assert host_obj == b"abcdefghi" + + +def test_memcpy_copy_host_to_host(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + + src_buf = b"abcdefghijklmnopqrstuvwxyz" + dst_buf = bytearray(len(src_buf)) + + q.memcpy(dst_buf, src_buf, len(src_buf)) + + assert dst_buf == src_buf + + +def test_memcpy_async(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + + src_buf = b"abcdefghijklmnopqrstuvwxyz" + n = len(src_buf) + dst_buf = bytearray(n) + dst_buf2 = bytearray(n) + + e = q.memcpy_async(dst_buf, src_buf, n) + e2 = q.memcpy_async(dst_buf2, src_buf, n, [e]) + + e.wait() + e2.wait() + assert dst_buf == src_buf + assert dst_buf2 == src_buf def test_memcpy_type_error(): @@ -56,8 +126,8 @@ def test_memcpy_type_error(): with pytest.raises(TypeError) as cm: q.memcpy(None, mobj, 3) - assert "`dest`" in str(cm.value) + assert "_Memory" in str(cm.value) with pytest.raises(TypeError) as cm: q.memcpy(mobj, None, 3) - assert "`src`" in str(cm.value) + assert "_Memory" in str(cm.value) diff --git a/dpctl/tests/test_tensor_array_api_inspection.py b/dpctl/tests/test_tensor_array_api_inspection.py new file mode 100644 index 0000000000..5ae0d35f8e --- /dev/null +++ b/dpctl/tests/test_tensor_array_api_inspection.py @@ -0,0 +1,163 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
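+
+# Note: dpt.__array_namespace_info__() implements the array API standard's
+# inspection utilities. The tests below cover capabilities(),
+# default_device(), default_dtypes(), devices() and dtypes(), including
+# filtering dtypes() by a kind string or by a tuple of kind strings.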
+
+import pytest
+
+import dpctl
+import dpctl.tensor as dpt
+from dpctl.tensor._tensor_impl import (
+    default_device_complex_type,
+    default_device_fp_type,
+    default_device_index_type,
+    default_device_int_type,
+)
+
+_dtypes_no_fp16_fp64 = {
+    "bool": dpt.bool,
+    "float32": dpt.float32,
+    "complex64": dpt.complex64,
+    "complex128": dpt.complex128,
+    "int8": dpt.int8,
+    "int16": dpt.int16,
+    "int32": dpt.int32,
+    "int64": dpt.int64,
+    "uint8": dpt.uint8,
+    "uint16": dpt.uint16,
+    "uint32": dpt.uint32,
+    "uint64": dpt.uint64,
+}
+
+
+class MockDevice:
+    def __init__(self, fp16: bool, fp64: bool):
+        self.has_aspect_fp16 = fp16
+        self.has_aspect_fp64 = fp64
+
+
+def test_array_api_inspection_methods():
+    info = dpt.__array_namespace_info__()
+    assert info.capabilities()
+    assert info.default_device()
+    assert info.default_dtypes()
+    assert info.devices()
+    assert info.dtypes()
+
+
+def test_array_api_inspection_default_device():
+    assert (
+        dpt.__array_namespace_info__().default_device()
+        == dpctl.select_default_device()
+    )
+
+
+def test_array_api_inspection_devices():
+    devices1 = dpt.__array_namespace_info__().devices()
+    devices2 = dpctl.get_devices()
+    assert len(devices1) == len(devices2)
+    assert devices1 == devices2
+
+
+def test_array_api_inspection_capabilities():
+    capabilities = dpt.__array_namespace_info__().capabilities()
+    assert capabilities["boolean_indexing"]
+    assert capabilities["data_dependent_shapes"]
+
+
+def test_array_api_inspection_default_dtypes():
+    dev = dpctl.select_default_device()
+
+    int_dt = default_device_int_type(dev)
+    ind_dt = default_device_index_type(dev)
+    fp_dt = default_device_fp_type(dev)
+    cm_dt = default_device_complex_type(dev)
+
+    info = dpt.__array_namespace_info__()
+    default_dts_nodev = info.default_dtypes()
+    default_dts_dev = info.default_dtypes(dev)
+
+    assert (
+        int_dt == default_dts_nodev["integral"] == default_dts_dev["integral"]
+    )
+    assert (
+        ind_dt == default_dts_nodev["indexing"] == default_dts_dev["indexing"]
+    )
+    assert (
+        fp_dt
+        == default_dts_nodev["real floating"]
+        == default_dts_dev["real floating"]
+    )
+    assert (
+        cm_dt
+        == default_dts_nodev["complex floating"]
+        == default_dts_dev["complex floating"]
+    )
+
+
+def test_array_api_inspection_default_device_dtypes():
+    dev = dpctl.select_default_device()
+    dtypes = _dtypes_no_fp16_fp64.copy()
+    if dev.has_aspect_fp64:
+        dtypes["float64"] = dpt.float64
+
+    assert dtypes == dpt.__array_namespace_info__().dtypes()
+
+
+@pytest.mark.parametrize("fp16", [True, False])
+@pytest.mark.parametrize("fp64", [True, False])
+def test_array_api_inspection_device_dtypes(fp16, fp64):
+    dev = MockDevice(fp16, fp64)
+    dtypes = _dtypes_no_fp16_fp64.copy()
+    if fp64:
+        dtypes["float64"] = dpt.float64
+
+    assert dtypes == dpt.__array_namespace_info__().dtypes(device=dev)
+
+
+def test_array_api_inspection_dtype_kind():
+    info = dpt.__array_namespace_info__()
+
+    f_dtypes = info.dtypes(kind="real floating")
+    assert all([_dt[1].kind == "f" for _dt in f_dtypes.items()])
+
+    i_dtypes = info.dtypes(kind="signed integer")
+    assert all([_dt[1].kind == "i" for _dt in i_dtypes.items()])
+
+    u_dtypes = info.dtypes(kind="unsigned integer")
+    assert all([_dt[1].kind == "u" for _dt in u_dtypes.items()])
+
+    ui_dtypes = info.dtypes(kind="integral")
+    assert all([_dt[1].kind in "ui" for _dt in ui_dtypes.items()])
+
+    c_dtypes = info.dtypes(kind="complex floating")
+    assert all([_dt[1].kind == "c" for _dt in c_dtypes.items()])
+
+    assert info.dtypes(kind="bool") == {"bool": dpt.bool}
+
+
_signed_ints = { + "int8": dpt.int8, + "int16": dpt.int16, + "int32": dpt.int32, + "int64": dpt.int64, + } + assert ( + info.dtypes(kind=("signed integer", "signed integer")) == _signed_ints + ) + assert ( + info.dtypes( + kind=("integral", "bool", "real floating", "complex floating") + ) + == info.dtypes() + ) diff --git a/dpctl/tests/test_tensor_clip.py b/dpctl/tests/test_tensor_clip.py new file mode 100644 index 0000000000..7050b17e7c --- /dev/null +++ b/dpctl/tests/test_tensor_clip.py @@ -0,0 +1,627 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from helper import get_queue_or_skip, skip_if_dtype_not_supported +from numpy.testing import assert_raises_regex + +import dpctl +import dpctl.tensor as dpt +from dpctl.tensor._type_utils import _can_cast +from dpctl.utils import ExecutionPlacementError + +_all_dtypes = [ + "?", + "u1", + "i1", + "u2", + "i2", + "u4", + "i4", + "u8", + "i8", + "e", + "f", + "d", + "F", + "D", +] + +_usm_types = ["device", "shared", "host"] + + +@pytest.mark.parametrize("dt1", _all_dtypes) +@pytest.mark.parametrize("dt2", _all_dtypes) +def test_clip_dtypes(dt1, dt2): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt1, q) + skip_if_dtype_not_supported(dt2, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=dt1, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=dt1, sycl_queue=q) + ar3 = dpt.ones_like(ar1, dtype=dt2, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + # also covers cases where dt1 == dt2 + if _can_cast(ar3.dtype, ar1.dtype, _fp16, _fp64): + r = dpt.clip(ar1, ar2, ar3) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.clip(ar1, min=ar3, max=None) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.clip(ar1, min=None, max=ar3) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + else: + with pytest.raises(ValueError): + dpt.clip(ar1, ar2, ar3) + with pytest.raises(ValueError): + dpt.clip(ar1, min=ar3, max=None) + with pytest.raises(ValueError): + dpt.clip(ar1, min=None, max=ar3) + + +def test_clip_empty(): + get_queue_or_skip() + + x = dpt.empty((2, 0, 3), dtype="i4") + a_min = dpt.ones((2, 0, 3), dtype="i4") + a_max = dpt.ones((2, 0, 3), dtype="i4") + + r = dpt.clip(x, a_min, a_max) + assert r.size == 0 + assert r.shape == x.shape + + +def test_clip_python_scalars(): + get_queue_or_skip() + + arrs = [ + dpt.ones(1, dtype="?"), + dpt.ones(1, dtype="i4"), + dpt.ones(1, dtype="f4"), + dpt.ones(1, dtype="c8"), + ] + + py_zeros = [ + False, + 0, + 0.0, + complex(0, 0), + ] + + py_ones = [ + True, + 1, + 1.0, + complex(1, 0), + ] + 
+ for zero, one, arr in zip(py_zeros, py_ones, arrs): + r = dpt.clip(arr, zero, one) + assert isinstance(r, dpt.usm_ndarray) + r = dpt.clip(arr, min=zero) + assert isinstance(r, dpt.usm_ndarray) + + +def test_clip_in_place(): + get_queue_or_skip() + + x = dpt.arange(10, dtype="i4") + a_min = dpt.arange(1, 11, dtype="i4") + a_max = dpt.arange(2, 12, dtype="i4") + dpt.clip(x, a_min, a_max, out=x) + assert dpt.all(x == a_min) + + x = dpt.arange(10, dtype="i4") + dpt.clip(x, min=a_min, max=None, out=x) + assert dpt.all(x == a_min) + + x = dpt.arange(10, dtype="i4") + dpt.clip(x, a_min, a_max, out=a_max) + assert dpt.all(a_max == a_min) + + a_min = dpt.arange(1, 11, dtype="i4") + dpt.clip(x, min=a_min, max=None, out=a_min[::-1]) + assert dpt.all((x + 1)[::-1] == a_min) + + +def test_clip_special_cases(): + get_queue_or_skip() + + x = dpt.arange(10, dtype="f4") + r = dpt.clip(x, -dpt.inf, dpt.inf) + assert dpt.all(r == x) + r = dpt.clip(x, dpt.nan, dpt.inf) + assert dpt.all(dpt.isnan(r)) + r = dpt.clip(x, -dpt.inf, dpt.nan) + assert dpt.all(dpt.isnan(r)) + + +def test_clip_out_need_temporary(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="i4") + a_min = dpt.asarray(2, dtype="i4") + a_max = dpt.asarray(3, dtype="i4") + dpt.clip(x[:6], 2, 3, out=x[-6:]) + assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2) + + x = dpt.ones(10, dtype="i4") + a_min = dpt.asarray(2, dtype="i4") + a_max = dpt.asarray(3, dtype="i2") + dpt.clip(x[:6], 2, 3, out=x[-6:]) + assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2) + + x = dpt.ones(10, dtype="i4") + a_min = dpt.asarray(2, dtype="i2") + a_max = dpt.asarray(3, dtype="i4") + dpt.clip(x[:6], 2, 3, out=x[-6:]) + assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2) + + x = dpt.ones(10, dtype="i4") + a_min = dpt.asarray(2, dtype="i2") + a_max = dpt.asarray(3, dtype="i1") + dpt.clip(x[:6], 2, 3, out=x[-6:]) + assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2) + + x = dpt.full(6, 3, dtype="i4") + a_min = dpt.full(10, 2, dtype="i4") + a_max = dpt.asarray(4, dtype="i4") + dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:]) + assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3) + + x = dpt.full(6, 3, dtype="i4") + a_min = dpt.full(10, 2, dtype="i4") + a_max = dpt.asarray(4, dtype="i2") + dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:]) + assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3) + + +def test_clip_out_need_temporary_none(): + get_queue_or_skip() + + x = dpt.full(6, 3, dtype="i4") + # with min/max == None + a_min = dpt.full(10, 2, dtype="i4") + dpt.clip(x, min=a_min[:6], max=None, out=a_min[-6:]) + assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3) + + +def test_clip_arg_validation(): + get_queue_or_skip() + + check = dict() + x1 = dpt.empty((1,), dtype="i4") + x2 = dpt.empty((1,), dtype="i4") + + with pytest.raises(TypeError): + dpt.clip(check, x1, x2) + + +@pytest.mark.parametrize( + "dt1,dt2", [("i4", "i4"), ("i4", "i2"), ("i2", "i4"), ("i1", "i2")] +) +def test_clip_order(dt1, dt2): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + ar1 = dpt.ones(test_shape, dtype="i4", order="C") + ar2 = dpt.ones(test_shape, dtype=dt1, order="C") + ar3 = dpt.ones(test_shape, dtype=dt2, order="C") + r1 = dpt.clip(ar1, ar2, ar3, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, ar2, ar3, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, ar2, ar3, order="A") + assert r3.flags.c_contiguous + r4 = dpt.clip(ar1, ar2, ar3, order="K") 
+ assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype="i4", order="F") + ar2 = dpt.ones(test_shape, dtype=dt1, order="F") + ar3 = dpt.ones(test_shape, dtype=dt2, order="F") + r1 = dpt.clip(ar1, ar2, ar3, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, ar2, ar3, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, ar2, ar3, order="A") + assert r3.flags.f_contiguous + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2] + ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2] + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.strides == (n, -1) + r5 = dpt.clip(ar1, ar2, ar3, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT + ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.strides == (-1, n) + r5 = dpt.clip(ar1, ar2, ar3, order="C") + assert r5.strides == (n, 1) + + +@pytest.mark.parametrize("dt", ["i4", "i2"]) +def test_clip_none_order(dt): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + ar1 = dpt.ones(test_shape, dtype="i4", order="C") + ar2 = dpt.ones(test_shape, dtype=dt, order="C") + + r1 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, min=None, max=ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, min=None, max=ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype="i4", order="F") + ar2 = dpt.ones(test_shape, dtype=dt, order="F") + + r1 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, min=None, max=ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, min=None, max=ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2] + + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.strides == (n, -1) + r5 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2].mT + + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.strides == (-1, n) + r5 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r5.strides == (n, 1) + + +@pytest.mark.parametrize("usm_type1", _usm_types) +@pytest.mark.parametrize("usm_type2", _usm_types) +@pytest.mark.parametrize("usm_type3", _usm_types) +def test_clip_usm_type_matrix(usm_type1, usm_type2, usm_type3): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2) + ar3 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type3) + + r = dpt.clip(ar1, ar2, ar3) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (usm_type1, usm_type2, usm_type3) + ) + assert r.usm_type == expected_usm_type + + +@pytest.mark.parametrize("usm_type1", _usm_types) 
+@pytest.mark.parametrize("usm_type2", _usm_types) +def test_clip_usm_type_matrix_none_arg(usm_type1, usm_type2): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2) + + r = dpt.clip(ar1, min=ar2, max=None) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type((usm_type1, usm_type2)) + assert r.usm_type == expected_usm_type + + +def test_clip_dtype_error(): + get_queue_or_skip() + + ar1 = dpt.ones(1, dtype="i4") + ar2 = dpt.ones(1, dtype="i4") + ar3 = dpt.ones(1, dtype="i4") + ar4 = dpt.empty_like(ar1, dtype="f4") + + assert_raises_regex( + ValueError, + "Output array of type.*is needed", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + assert_raises_regex( + ValueError, + "Output array of type.*is needed", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + +def test_clip_errors(): + get_queue_or_skip() + try: + gpu_queue = dpctl.SyclQueue("gpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('gpu') failed, skipping") + try: + cpu_queue = dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('cpu') failed, skipping") + + ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue) + ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + ar3 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + ar4 = dpt.empty_like(ar1, sycl_queue=cpu_queue) + assert_raises_regex( + ExecutionPlacementError, + "Input and output allocation queues are not compatible", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Input and output allocation queues are not compatible", + dpt.clip, + ar1, + None, + ar3, + ar4, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + ar2, + ar3, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + 1, + ar3, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + 1, + ar4, + ar3, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + None, + ar2, + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="float32") + ar3 = dpt.ones_like(ar1, dtype="float32") + ar4 = dpt.empty(3, dtype="float32") + assert_raises_regex( + ValueError, + "The shape of input and output arrays are inconsistent", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + ValueError, + "The shape of input and output arrays are inconsistent", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + ar1 = np.ones(2, dtype="f4") + ar2 = dpt.ones(2, dtype="f4") + ar3 = dpt.ones(2, dtype="f4") + assert_raises_regex( + TypeError, + "Expected `x` to be of dpctl.tensor.usm_ndarray type*", + dpt.clip, + ar1, + ar2, + ar3, + ) + + ar1 = dpt.ones(2, dtype="i4") + ar2 = dpt.ones_like(ar1, dtype="i4") + ar3 = dpt.ones_like(ar1, dtype="i4") + ar4 = np.empty_like(ar1) + assert_raises_regex( + TypeError, + "output array must be of usm_ndarray type", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + TypeError, + "output array must be of usm_ndarray type", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + +def 
test_clip_out_type_check(): + get_queue_or_skip() + + x1 = dpt.ones(10) + x2 = dpt.ones(10) + x3 = dpt.ones(10) + + out = range(10) + + with pytest.raises(TypeError): + dpt.clip(x1, x2, x3, out=out) + + +@pytest.mark.parametrize("dt", ["i4", "f4", "c8"]) +def test_clip_basic(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + sz = 1026 + x = dpt.arange(sz, dtype=dt, sycl_queue=q) + r = dpt.clip(x, min=100, max=500) + expected = dpt.arange(sz, dtype=dt, sycl_queue=q) + expected[:100] = 100 + expected[500:] = 500 + assert dpt.all(expected == r) + + x = dpt.zeros(sz, dtype=dt, sycl_queue=q) + a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q) + a_max[::2] = -2 + r = dpt.clip(x, min=-3, max=a_max) + assert dpt.all(a_max == r) + + +@pytest.mark.parametrize("dt", ["i4", "f4", "c8"]) +def test_clip_strided(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + sz = 2 * 1026 + x = dpt.arange(sz, dtype=dt, sycl_queue=q)[::-2] + r = dpt.clip(x, min=100, max=500) + expected = dpt.arange(sz, dtype=dt, sycl_queue=q) + expected[:100] = 100 + expected[500:] = 500 + expected = expected[::-2] + assert dpt.all(expected == r) + + x = dpt.zeros(sz, dtype=dt, sycl_queue=q)[::-2] + a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q) + a_max[::2] = -2 + a_max = a_max[::-2] + r = dpt.clip(x, min=-3, max=a_max) + assert dpt.all(a_max == r) + + +def test_clip_max_less_than_min(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="i4") + res = dpt.clip(x, 5, 0) + assert dpt.all(res == 0) + + +def test_clip_minmax_weak_types(): + get_queue_or_skip() + + x = dpt.zeros(10, dtype=dpt.bool) + min_list = [False, 0, 0.0, 0.0 + 0.0j] + max_list = [True, 1, 1.0, 1.0 + 0.0j] + for min_v, max_v in zip(min_list, max_list): + if isinstance(min_v, bool) and isinstance(max_v, bool): + y = dpt.clip(x, min_v, max_v) + assert isinstance(y, dpt.usm_ndarray) + else: + with pytest.raises(ValueError): + dpt.clip(x, min_v, max_v) + + +def test_clip_max_weak_types(): + get_queue_or_skip() + + x = dpt.zeros(10, dtype="i4") + m = dpt.ones(10, dtype="i4") + + with pytest.raises(ValueError): + dpt.clip(x, m, 2.5) + + with pytest.raises(ValueError): + dpt.clip(x, 2.5, m) diff --git a/dpctl/tests/test_tensor_statistical_functions.py b/dpctl/tests/test_tensor_statistical_functions.py new file mode 100644 index 0000000000..8916833f86 --- /dev/null +++ b/dpctl/tests/test_tensor_statistical_functions.py @@ -0,0 +1,254 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
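+
+# Note: following array API conventions, mean/std/var of boolean and integer
+# input produce the device's default floating-point type, and std/var accept
+# a ``correction`` term so that the divisor is N - correction (correction=0
+# gives the population statistic, correction=1 Bessel's correction). When
+# correction equals the reduced axis length the divisor is zero and the
+# result is NaN, which test_var_axis_length_correction relies on.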
+ +import pytest + +import dpctl.tensor as dpt +from dpctl.tensor._tensor_impl import default_device_fp_type +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +_no_complex_dtypes = [ + "?", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", +] + + +@pytest.mark.parametrize("dt", _no_complex_dtypes) +def test_mean_dtypes(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x = dpt.ones(10, dtype=dt) + res = dpt.mean(x) + assert res == 1 + if x.dtype.kind in "biu": + assert res.dtype == dpt.dtype(default_device_fp_type(q)) + else: + assert res.dtype == x.dtype + + +@pytest.mark.parametrize("dt", _no_complex_dtypes) +@pytest.mark.parametrize("py_zero", [float(0), int(0)]) +def test_std_var_dtypes(dt, py_zero): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x = dpt.ones(10, dtype=dt) + res = dpt.std(x, correction=py_zero) + assert res == 0 + if x.dtype.kind in "biu": + assert res.dtype == dpt.dtype(default_device_fp_type(q)) + else: + assert res.dtype == x.dtype + + res = dpt.var(x, correction=py_zero) + assert res == 0 + if x.dtype.kind in "biu": + assert res.dtype == dpt.dtype(default_device_fp_type(q)) + else: + assert res.dtype == x.dtype + + +def test_stat_fns_axis(): + get_queue_or_skip() + + x = dpt.ones((3, 4, 5, 6, 7), dtype="f4") + m = dpt.mean(x, axis=(1, 2, -1)) + + assert isinstance(m, dpt.usm_ndarray) + assert m.shape == (3, 6) + assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype)) + + s = dpt.var(x, axis=(1, 2, -1)) + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 6) + assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype)) + + +@pytest.mark.parametrize("fn", [dpt.mean, dpt.var]) +def test_stat_fns_empty(fn): + get_queue_or_skip() + x = dpt.empty((0,), dtype="f4") + r = fn(x) + assert r.shape == tuple() + assert dpt.isnan(r) + + x = dpt.empty((10, 0, 2), dtype="f4") + r = fn(x, axis=1) + assert r.shape == (10, 2) + assert dpt.all(dpt.isnan(r)) + + r = fn(x, axis=0) + assert r.shape == (0, 2) + assert r.size == 0 + + +def test_stat_fns_keepdims(): + get_queue_or_skip() + + x = dpt.ones((3, 4, 5, 6, 7), dtype="f4") + m = dpt.mean(x, axis=(1, 2, -1), keepdims=True) + + assert isinstance(m, dpt.usm_ndarray) + assert m.shape == (3, 1, 1, 6, 1) + assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype)) + + s = dpt.var(x, axis=(1, 2, -1), keepdims=True) + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 1, 1, 6, 1) + assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype)) + + +def test_stat_fns_empty_axis(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5)) + m = dpt.mean(x, axis=()) + + assert x.shape == m.shape + assert dpt.all(x == m) + + s = dpt.var(x, axis=()) + assert x.shape == s.shape + assert dpt.all(s == 0) + + d = dpt.std(x, axis=()) + assert x.shape == d.shape + assert dpt.all(d == 0) + + +def test_mean(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3)) + m = dpt.mean(x) + expected = dpt.asarray(4, dtype="f4") + assert dpt.allclose(m, expected) + + m = dpt.mean(x, axis=0) + expected = dpt.arange(3, 6, dtype="f4") + assert dpt.allclose(m, expected) + + m = dpt.mean(x, axis=1) + expected = dpt.asarray([1, 4, 7], dtype="f4") + assert dpt.allclose(m, expected) + + +def test_var_std(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3)) + r = dpt.var(x) + expected = dpt.asarray(6.666666507720947, dtype="f4") + assert dpt.allclose(r, expected) + + r1 = dpt.var(x, 
correction=3) + expected1 = dpt.asarray(10.0, dtype="f4") + assert dpt.allclose(r1, expected1) + + r = dpt.std(x) + expected = dpt.sqrt(expected) + assert dpt.allclose(r, expected) + + r1 = dpt.std(x, correction=3) + expected1 = dpt.sqrt(expected1) + assert dpt.allclose(r1, expected1) + + r = dpt.var(x, axis=0) + expected = dpt.full(x.shape[1], 6, dtype="f4") + assert dpt.allclose(r, expected) + + r1 = dpt.var(x, axis=0, correction=1) + expected1 = dpt.full(x.shape[1], 9, dtype="f4") + assert dpt.allclose(r1, expected1) + + r = dpt.std(x, axis=0) + expected = dpt.sqrt(expected) + assert dpt.allclose(r, expected) + + r1 = dpt.std(x, axis=0, correction=1) + expected1 = dpt.sqrt(expected1) + assert dpt.allclose(r1, expected1) + + r = dpt.var(x, axis=1) + expected = dpt.full(x.shape[0], 0.6666666865348816, dtype="f4") + assert dpt.allclose(r, expected) + + r1 = dpt.var(x, axis=1, correction=1) + expected1 = dpt.ones(x.shape[0], dtype="f4") + assert dpt.allclose(r1, expected1) + + r = dpt.std(x, axis=1) + expected = dpt.sqrt(expected) + assert dpt.allclose(r, expected) + + r1 = dpt.std(x, axis=1, correction=1) + expected1 = dpt.sqrt(expected1) + assert dpt.allclose(r1, expected1) + + +def test_var_axis_length_correction(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3)) + + r = dpt.var(x, correction=x.size) + assert dpt.isnan(r) + + r = dpt.var(x, axis=0, correction=x.shape[0]) + assert dpt.all(dpt.isnan(r)) + + r = dpt.var(x, axis=1, correction=x.shape[1]) + assert dpt.all(dpt.isnan(r)) + + +def test_stat_function_errors(): + d = dict() + with pytest.raises(TypeError): + dpt.var(d) + with pytest.raises(TypeError): + dpt.std(d) + with pytest.raises(TypeError): + dpt.mean(d) + + x = dpt.empty(1, dtype="f4") + with pytest.raises(TypeError): + dpt.var(x, axis=d) + with pytest.raises(TypeError): + dpt.std(x, axis=d) + with pytest.raises(TypeError): + dpt.mean(x, axis=d) + + with pytest.raises(TypeError): + dpt.var(x, correction=d) + with pytest.raises(TypeError): + dpt.std(x, correction=d) + + x = dpt.empty(1, dtype="c8") + with pytest.raises(ValueError): + dpt.var(x) + with pytest.raises(ValueError): + dpt.std(x) diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py index 403a823324..33fe4a8b4f 100644 --- a/dpctl/tests/test_tensor_sum.py +++ b/dpctl/tests/test_tensor_sum.py @@ -14,10 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
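For reference, the `correction` semantics pinned down by `test_var_std` above follow the array API convention: for an input with N elements, `dpt.var` divides the sum of squared deviations by `N - correction` (with `correction=0` by default), and `dpt.std` is its square root. A minimal sketch reproducing the tested values:

```python
import dpctl.tensor as dpt

x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3))

# mean(x) == 4 and sum((x - 4)**2) == 60 for arange(9)
print(float(dpt.var(x)))                # 60 / 9  ~= 6.6667
print(float(dpt.var(x, correction=3)))  # 60 / (9 - 3) == 10.0
print(float(dpt.std(x, correction=3)))  # sqrt(10) ~= 3.1623
```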
-import numpy as np import pytest import dpctl.tensor as dpt +import dpctl.utils as du from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported _all_dtypes = [ @@ -36,7 +36,6 @@ "c8", "c16", ] -_usm_types = ["device", "shared", "host"] @pytest.mark.parametrize("arg_dtype", _all_dtypes) @@ -44,6 +43,7 @@ def test_sum_arg_dtype_default_output_dtype_matrix(arg_dtype): q = get_queue_or_skip() skip_if_dtype_not_supported(arg_dtype, q) + # test reduction for C-contiguous input m = dpt.ones(100, dtype=arg_dtype) r = dpt.sum(m) @@ -56,11 +56,19 @@ def test_sum_arg_dtype_default_output_dtype_matrix(arg_dtype): assert r.dtype.kind == "f" elif m.dtype.kind == "c": assert r.dtype.kind == "c" - assert (dpt.asnumpy(r) == 100).all() + assert dpt.all(r == 100) + + # test reduction for strided input m = dpt.ones(200, dtype=arg_dtype)[:1:-2] r = dpt.sum(m) - assert (dpt.asnumpy(r) == 99).all() + assert dpt.all(r == 99) + + # test reduction for strided input which can be simplified + # to contiguous computation + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.sum(dpt.flip(m)) + assert dpt.all(r == 100) @pytest.mark.parametrize("arg_dtype", _all_dtypes) @@ -75,7 +83,7 @@ def test_sum_arg_out_dtype_matrix(arg_dtype, out_dtype): assert isinstance(r, dpt.usm_ndarray) assert r.dtype == dpt.dtype(out_dtype) - assert (dpt.asnumpy(r) == 100).all() + assert dpt.all(r == 100) def test_sum_empty(): @@ -94,7 +102,7 @@ def test_sum_axis(): assert isinstance(s, dpt.usm_ndarray) assert s.shape == (3, 6) - assert (dpt.asnumpy(s) == np.full(s.shape, 4 * 5 * 7)).all() + assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype="i4")) def test_sum_keepdims(): @@ -105,7 +113,7 @@ def test_sum_keepdims(): assert isinstance(s, dpt.usm_ndarray) assert s.shape == (3, 1, 1, 6, 1) - assert (dpt.asnumpy(s) == np.full(s.shape, 4 * 5 * 7)).all() + assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype=s.dtype)) def test_sum_scalar(): @@ -117,7 +125,7 @@ def test_sum_scalar(): assert isinstance(s, dpt.usm_ndarray) assert m.sycl_queue == s.sycl_queue assert s.shape == () - assert dpt.asnumpy(s) == np.full((), 1) + assert s == dpt.full((), 1) @pytest.mark.parametrize("arg_dtype", _all_dtypes) @@ -132,7 +140,7 @@ def test_sum_arg_out_dtype_scalar(arg_dtype, out_dtype): assert isinstance(r, dpt.usm_ndarray) assert r.dtype == dpt.dtype(out_dtype) - assert dpt.asnumpy(r) == 1 + assert r == 1 def test_sum_keepdims_zero_size(): @@ -174,6 +182,21 @@ def test_largish_reduction(arg_dtype, n): assert dpt.all(dpt.equal(y1, n * m)) +@pytest.mark.parametrize("n", [1023, 1024, 1025]) +def test_largish_reduction_axis1_axis0(n): + get_queue_or_skip() + + m = 25 + x1 = dpt.ones((m, n), dtype="f4") + x2 = dpt.ones((n, m), dtype="f4") + + y1 = dpt.sum(x1, axis=1) + y2 = dpt.sum(x2, axis=0) + + assert dpt.all(y1 == n) + assert dpt.all(y2 == n) + + def test_axis0_bug(): "gh-1391" get_queue_or_skip() @@ -187,3 +210,131 @@ def test_axis0_bug(): expected = dpt.asarray([[0, 3], [1, 4], [2, 5]]) assert dpt.all(s == expected) + + +def test_sum_axis1_axis0(): + """See gh-1455""" + get_queue_or_skip() + + # The atomic case is checked in `test_usm_ndarray_reductions` + # This test checks the tree reduction path for correctness + x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5)) + + m = dpt.sum(x, axis=0) + expected = dpt.asarray( + [ + [60, 63, 66, 69, 72], + [75, 78, 81, 84, 87], + [90, 93, 96, 99, 102], + [105, 108, 111, 114, 117], + ], + dtype="f4", + ) + tol = dpt.finfo(m.dtype).resolution + assert dpt.allclose(m, expected, atol=tol, rtol=tol) + + x 
= dpt.flip(x, axis=2)
+    m = dpt.sum(x, axis=2)
+    expected = dpt.asarray(
+        [[10, 35, 60, 85], [110, 135, 160, 185], [210, 235, 260, 285]],
+        dtype="f4",
+    )
+    assert dpt.allclose(m, expected, atol=tol, rtol=tol)
+
+
+def _any_complex(dtypes):
+    return any(dpt.isdtype(dpt.dtype(dt), "complex floating") for dt in dtypes)
+
+
+def _skip_on_this_device(sycl_dev):
+    device_mask = du.intel_device_info(sycl_dev).get("device_id", 0) & 0xFF00
+    return device_mask in [0x3E00, 0x9B00]
+
+
+@pytest.mark.parametrize("arg_dtype", _all_dtypes[1:])
+def test_prod_arg_dtype_default_output_dtype_matrix(arg_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+
+    arg_dtype = dpt.dtype(arg_dtype)
+    if _any_complex((arg_dtype,)):
+        if _skip_on_this_device(q.sycl_device):
+            pytest.skip(
+                "Product reductions for complex output are known "
+                "to fail for Gen9 with 2024.0 compiler"
+            )
+
+    m = dpt.ones(100, dtype=arg_dtype)
+    r = dpt.prod(m)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    if m.dtype.kind == "i":
+        assert r.dtype.kind == "i"
+    elif m.dtype.kind == "u":
+        assert r.dtype.kind == "u"
+    elif m.dtype.kind == "f":
+        assert r.dtype.kind == "f"
+    elif m.dtype.kind == "c":
+        assert r.dtype.kind == "c"
+    assert dpt.all(r == 1)
+
+    if dpt.isdtype(m.dtype, "unsigned integer"):
+        m = dpt.tile(dpt.arange(1, 3, dtype=arg_dtype), 10)[:1:-2]
+        r = dpt.prod(m)
+        assert dpt.all(r == dpt.asarray(512, dtype=r.dtype))
+    else:
+        m = dpt.full(200, -1, dtype=arg_dtype)[:1:-2]
+        r = dpt.prod(m)
+        assert dpt.all(r == dpt.asarray(-1, dtype=r.dtype))
+
+
+def test_prod_empty():
+    get_queue_or_skip()
+    x = dpt.empty((0,), dtype="u1")
+    y = dpt.prod(x)
+    assert y.shape == tuple()
+    assert int(y) == 1
+
+
+def test_prod_axis():
+    get_queue_or_skip()
+
+    m = dpt.ones((3, 4, 5, 6, 7), dtype="i4")
+    s = dpt.prod(m, axis=(1, 2, -1))
+
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 6)
+    assert dpt.all(s == dpt.asarray(1, dtype="i4"))
+
+
+@pytest.mark.parametrize("arg_dtype", _all_dtypes)
+@pytest.mark.parametrize("out_dtype", _all_dtypes[1:])
+def test_prod_arg_out_dtype_matrix(arg_dtype, out_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+    skip_if_dtype_not_supported(out_dtype, q)
+
+    out_dtype = dpt.dtype(out_dtype)
+    arg_dtype = dpt.dtype(arg_dtype)
+    if _any_complex((arg_dtype, out_dtype)):
+        if _skip_on_this_device(q.sycl_device):
+            pytest.skip(
+                "Product reductions for complex output are known "
+                "to fail for Gen9 with 2024.0 compiler"
+            )
+
+    m = dpt.ones(100, dtype=arg_dtype)
+    r = dpt.prod(m, dtype=out_dtype)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype == dpt.dtype(out_dtype)
+    assert dpt.all(r == 1)
+
+
+def test_gh_1468():
+    "See https://github.com/IntelPython/dpctl/issues/1468"
+    get_queue_or_skip()
+
+    a = dpt.full((2, 3, 4), 123456789, dtype=dpt.int32)
+    t = dpt.sum(a, dtype="f4")
+    assert t > 0
diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py
index 72f5aabebb..095bbc5638 100644
--- a/dpctl/tests/test_usm_ndarray_ctor.py
+++ b/dpctl/tests/test_usm_ndarray_ctor.py
@@ -39,6 +39,7 @@
         (2, 5, 2),
         (2, 2, 2, 2, 2, 2, 2, 2),
         5,
+        np.int32(7),
     ],
 )
 @pytest.mark.parametrize("usm_type", ["shared", "host", "device"])
diff --git a/dpctl/tests/test_usm_ndarray_manipulation.py b/dpctl/tests/test_usm_ndarray_manipulation.py
index 2126727d5b..f3704274d4 100644
--- a/dpctl/tests/test_usm_ndarray_manipulation.py
+++ b/dpctl/tests/test_usm_ndarray_manipulation.py
@@ -1170,6 +1170,12 @@ def test_repeat_axes():
    res = 
dpt.repeat(x, reps, axis=1) assert dpt.all(res == expected_res) + x = dpt.arange(10, dtype="i4") + expected_res = dpt.empty(x.shape[0] * reps, x.dtype) + expected_res[::2], expected_res[1::2] = x, x + res = dpt.repeat(x, reps, axis=0) + assert dpt.all(res == expected_res) + def test_repeat_size_0_outputs(): get_queue_or_skip() @@ -1193,11 +1199,17 @@ def test_repeat_size_0_outputs(): assert res.size == 0 assert res.shape == (3, 0, 5) - x = dpt.ones((3, 2, 5)) res = dpt.repeat(x, (0, 0), axis=1) assert res.size == 0 assert res.shape == (3, 0, 5) + # axis=None cases + res = dpt.repeat(x, 0) + assert res.size == 0 + + res = dpt.repeat(x, (0,) * x.size) + assert res.size == 0 + def test_repeat_strides(): get_queue_or_skip() @@ -1220,6 +1232,17 @@ def test_repeat_strides(): res = dpt.repeat(x1, (reps,) * x1.shape[0], axis=0) assert dpt.all(res == expected_res) + # axis=None + x = dpt.reshape(dpt.arange(10 * 10), (10, 10)) + x1 = dpt.reshape(x[::-2, :], -1) + x2 = x[::-2, :] + expected_res = dpt.empty(10 * 10, dtype="i4") + expected_res[::2], expected_res[1::2] = x1, x1 + res = dpt.repeat(x2, reps) + assert dpt.all(res == expected_res) + res = dpt.repeat(x2, (reps,) * x1.size) + assert dpt.all(res == expected_res) + def test_repeat_casting(): get_queue_or_skip() @@ -1256,11 +1279,6 @@ def test_repeat_arg_validation(): with pytest.raises(ValueError): dpt.repeat(x, 2, axis=1) - # x.ndim cannot be > 1 for axis=None - x = dpt.empty((5, 10)) - with pytest.raises(ValueError): - dpt.repeat(x, 2, axis=None) - # repeats must be positive x = dpt.empty(5) with pytest.raises(ValueError): diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py new file mode 100644 index 0000000000..0969822e6d --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -0,0 +1,499 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
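The manipulation-test changes above remove the old `ValueError` for `axis=None` with multi-dimensional input: `dpt.repeat` now flattens the array first, mirroring `numpy.repeat`. A small sketch of the behavior the new tests rely on:

```python
import dpctl.tensor as dpt

x = dpt.asarray([[1, 2], [3, 4]], dtype="i4")

# axis=None (the default) flattens the input before repeating
r = dpt.repeat(x, 2)
print(dpt.asnumpy(r))  # [1 1 2 2 3 3 4 4]

# a tuple of per-element counts must match the flattened size
r = dpt.repeat(x, (1, 2, 3, 0))
print(dpt.asnumpy(r))  # [1 2 2 3 3 3]
```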
+ +from random import randrange + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +_no_complex_dtypes = [ + "?", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", +] + + +_all_dtypes = _no_complex_dtypes + [ + "c8", + "c16", +] + + +def test_max_min_axis(): + get_queue_or_skip() + + x = dpt.reshape( + dpt.arange((3 * 4 * 5 * 6 * 7), dtype="i4"), (3, 4, 5, 6, 7) + ) + + m = dpt.max(x, axis=(1, 2, -1)) + assert m.shape == (3, 6) + assert dpt.all(m == x[:, -1, -1, :, -1]) + + m = dpt.min(x, axis=(1, 2, -1)) + assert m.shape == (3, 6) + assert dpt.all(m == x[:, 0, 0, :, 0]) + + +def test_max_axis1_axis0(): + """See gh-1455""" + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(3 * 4 * 5), (3, 4, 5)) + + m = dpt.max(x, axis=0) + assert dpt.all(m == x[-1, :, :]) + + x = dpt.flip(x, axis=2) + m = dpt.max(x, axis=2) + assert dpt.all(m == x[:, :, 0]) + + +def test_reduction_keepdims(): + get_queue_or_skip() + + n0, n1 = 3, 6 + x = dpt.ones((n0, 4, 5, n1, 7), dtype="i4") + m = dpt.max(x, axis=(1, 2, -1), keepdims=True) + + xx = dpt.reshape(dpt.permute_dims(x, (0, 3, 1, 2, -1)), (n0, n1, -1)) + p = dpt.argmax(xx, axis=-1, keepdims=True) + + assert m.shape == (n0, 1, 1, n1, 1) + assert dpt.all(m == dpt.reshape(x[:, 0, 0, :, 0], m.shape)) + assert dpt.all(p == 0) + + +def test_max_scalar(): + get_queue_or_skip() + + x = dpt.ones(()) + m = dpt.max(x) + + assert m.shape == () + assert x == m + + +@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"]) +def test_reduction_kernels(arg_dtype): + # i4 - always uses atomics w/ sycl group reduction + # f4 - always uses atomics w/ custom group reduction + # c8 - always uses temps w/ custom group reduction + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q) + x[x.shape[0] // 2, :] = 3 + x[:, x.shape[1] // 2] = 3 + + m = dpt.max(x) + assert m == 3 + m = dpt.max(x, axis=0) + assert dpt.all(m == 3) + m = dpt.max(x, axis=1) + assert dpt.all(m == 3) + + x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q) + x[x.shape[0] // 2, :] = 0 + x[:, x.shape[1] // 2] = 0 + + m = dpt.min(x) + assert m == 0 + m = dpt.min(x, axis=0) + assert dpt.all(m == 0) + m = dpt.min(x, axis=1) + assert dpt.all(m == 0) + + +def test_max_min_nan_propagation(): + get_queue_or_skip() + + # float, finites + x = dpt.arange(4, dtype="f4") + x[0] = dpt.nan + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + # float, infinities + x[1:] = dpt.inf + assert dpt.isnan(dpt.max(x)) + x[1:] = -dpt.inf + assert dpt.isnan(dpt.min(x)) + + # complex + x = dpt.arange(4, dtype="c8") + x[0] = complex(dpt.nan, 0) + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + x[0] = complex(0, dpt.nan) + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + +def test_argmax_scalar(): + get_queue_or_skip() + + x = dpt.ones(()) + m = dpt.argmax(x) + + assert m.shape == () + assert m == 0 + + +@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"]) +def test_search_reduction_kernels(arg_dtype): + # i4 - always uses atomics w/ sycl group reduction + # f4 - always uses atomics w/ custom group reduction + # c8 - always uses temps w/ custom group reduction + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x_shape = (24, 1024) + x_size = np.prod(x_shape) + x = dpt.ones(x_size, dtype=arg_dtype, sycl_queue=q) + idx = 
randrange(x.size) + idx_tup = np.unravel_index(idx, x_shape) + x[idx] = 2 + + m = dpt.argmax(x) + assert m == idx + + # test case of strided input mapping to contig + # implementation + m = dpt.argmax(dpt.flip(x)) + assert m == x.size - 1 - idx + + # test case of strided implementation + y = dpt.ones(2 * x.size, dtype=arg_dtype, sycl_queue=q) + y[::2] = x + m = dpt.argmax(y) + assert m == 2 * idx + + x = dpt.reshape(x, x_shape) + + x[idx_tup[0], :] = 3 + m = dpt.argmax(x, axis=0) + assert dpt.all(m == idx_tup[0]) + x[:, idx_tup[1]] = 4 + m = dpt.argmax(x, axis=1) + assert dpt.all(m == idx_tup[1]) + + x = x[:, ::-2] + idx = randrange(x.shape[1]) + x[:, idx] = 5 + m = dpt.argmax(x, axis=1) + assert dpt.all(m == idx) + + x = dpt.ones(x_size, dtype=arg_dtype, sycl_queue=q) + idx = randrange(x.size) + idx_tup = np.unravel_index(idx, x_shape) + x[idx] = 0 + + m = dpt.argmin(x) + assert m == idx + + x = dpt.reshape(x, x_shape) + + x[idx_tup[0], :] = -1 + m = dpt.argmin(x, axis=0) + assert dpt.all(m == idx_tup[0]) + x[:, idx_tup[1]] = -2 + m = dpt.argmin(x, axis=1) + assert dpt.all(m == idx_tup[1]) + + x = x[:, ::-2] + idx = randrange(x.shape[1]) + x[:, idx] = -3 + m = dpt.argmin(x, axis=1) + assert dpt.all(m == idx) + + +def test_argmax_argmin_nan_propagation(): + get_queue_or_skip() + + sz = 4 + idx = randrange(sz) + # floats + x = dpt.arange(sz, dtype="f4") + x[idx] = dpt.nan + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + # complex + x = dpt.arange(sz, dtype="c8") + x[idx] = complex(dpt.nan, 0) + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + x[idx] = complex(0, dpt.nan) + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + +def test_argmax_argmin_identities(): + # make sure that identity arrays work as expected + get_queue_or_skip() + + x = dpt.full(3, dpt.iinfo(dpt.int32).min, dtype="i4") + assert dpt.argmax(x) == 0 + x = dpt.full(3, dpt.iinfo(dpt.int32).max, dtype="i4") + assert dpt.argmin(x) == 0 + + +@pytest.mark.parametrize("order", ["C", "F"]) +def test_argmax_axis0_axis1(order): + get_queue_or_skip() + + x = dpt.asarray([[1, 2, 3], [6, 5, 4]], dtype="i4", order=order) + assert dpt.argmax(x) == 3 + + res = dpt.argmax(x, axis=0) + expected = dpt.asarray([1, 1, 1], dtype=res.dtype) + assert dpt.all(res == expected) + + res = dpt.argmax(x, axis=1) + expected = dpt.asarray([2, 0], dtype=res.dtype) + assert dpt.all(res == expected) + + +def test_reduction_arg_validation(): + get_queue_or_skip() + + x = dict() + with pytest.raises(TypeError): + dpt.sum(x) + with pytest.raises(TypeError): + dpt.max(x) + with pytest.raises(TypeError): + dpt.argmax(x) + + x = dpt.zeros((0,), dtype="i4") + with pytest.raises(ValueError): + dpt.max(x) + with pytest.raises(ValueError): + dpt.argmax(x) + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +def test_logsumexp_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.logsumexp(m) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype.kind == "f" + tol = dpt.finfo(r.dtype).resolution + assert_allclose( + dpt.asnumpy(r), + np.logaddexp.reduce(dpt.asnumpy(m), dtype=r.dtype), + rtol=tol, + atol=tol, + ) + + +def test_logsumexp_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="f4") + y = dpt.logsumexp(x) + assert y.shape == tuple() + assert y == -dpt.inf + + +def test_logsumexp_axis(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="f4") + s = dpt.logsumexp(m, 
axis=(1, 2, -1)) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 6) + tol = dpt.finfo(s.dtype).resolution + assert_allclose( + dpt.asnumpy(s), + np.logaddexp.reduce(dpt.asnumpy(m), axis=(1, 2, -1), dtype=s.dtype), + rtol=tol, + atol=tol, + ) + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_logsumexp_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.logsumexp(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + + +def test_logsumexp_keepdims(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + s = dpt.logsumexp(m, axis=(1, 2, -1), keepdims=True) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 1, 1, 6, 1) + + +def test_logsumexp_keepdims_zero_size(): + get_queue_or_skip() + n = 10 + a = dpt.ones((n, 0, n)) + + s1 = dpt.logsumexp(a, keepdims=True) + assert s1.shape == (1, 1, 1) + + s2 = dpt.logsumexp(a, axis=(0, 1), keepdims=True) + assert s2.shape == (1, 1, n) + + s3 = dpt.logsumexp(a, axis=(1, 2), keepdims=True) + assert s3.shape == (n, 1, 1) + + s4 = dpt.logsumexp(a, axis=(0, 2), keepdims=True) + assert s4.shape == (1, 0, 1) + + a0 = a[0] + s5 = dpt.logsumexp(a0, keepdims=True) + assert s5.shape == (1, 1) + + +def test_logsumexp_scalar(): + get_queue_or_skip() + + m = dpt.ones(()) + s = dpt.logsumexp(m) + + assert isinstance(s, dpt.usm_ndarray) + assert m.sycl_queue == s.sycl_queue + assert s.shape == () + + +def test_logsumexp_complex(): + get_queue_or_skip() + + x = dpt.zeros(1, dtype="c8") + with pytest.raises(TypeError): + dpt.logsumexp(x) + + +def test_logsumexp_int_axis(): + get_queue_or_skip() + + x = dpt.zeros((8, 10), dtype="f4") + res = dpt.logsumexp(x, axis=0) + assert res.ndim == 1 + assert res.shape[0] == 10 + + +def test_logsumexp_invalid_arr(): + x = dict() + with pytest.raises(TypeError): + dpt.logsumexp(x) + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +def test_hypot_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.reduce_hypot(m) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype.kind == "f" + tol = dpt.finfo(r.dtype).resolution + assert_allclose( + dpt.asnumpy(r), + np.hypot.reduce(dpt.asnumpy(m), dtype=r.dtype), + rtol=tol, + atol=tol, + ) + + +def test_hypot_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="f4") + y = dpt.reduce_hypot(x) + assert y.shape == tuple() + assert y == 0 + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_hypot_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.reduce_hypot(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + + +def test_hypot_complex(): + get_queue_or_skip() + + x = dpt.zeros(1, dtype="c8") + with pytest.raises(TypeError): + dpt.reduce_hypot(x) + + +def test_tree_reduction_axis1_axis0(): + """See gh-1455""" + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5)) + + m = dpt.logsumexp(x, axis=0) + tol = 
dpt.finfo(m.dtype).resolution + assert_allclose( + dpt.asnumpy(m), + np.logaddexp.reduce(dpt.asnumpy(x), axis=0, dtype=m.dtype), + rtol=tol, + atol=tol, + ) + + x = dpt.flip(x, axis=2) + m = dpt.logsumexp(x, axis=2) + assert_allclose( + dpt.asnumpy(m), + np.logaddexp.reduce(dpt.asnumpy(x), axis=2, dtype=m.dtype), + rtol=tol, + atol=tol, + ) diff --git a/dpctl/tests/test_utils.py b/dpctl/tests/test_utils.py index df4a9f503f..05b2dc7890 100644 --- a/dpctl/tests/test_utils.py +++ b/dpctl/tests/test_utils.py @@ -122,3 +122,27 @@ def test_onetrace_enabled(): with dpctl.utils.onetrace_enabled(): assert os.getenv(v_name, None) == "1" assert os.getenv(v_name, None) == v_v + + +def test_intel_device_info(): + try: + d = dpctl.select_default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("Default device could not be created") + descr = dpctl.utils.intel_device_info(d) + assert isinstance(descr, dict) + assert ("device_id" in descr) or not descr + allowed_names = [ + "device_id", + "gpu_slices", + "gpu_eu_count", + "gpu_eu_simd_width", + "gpu_hw_threads_per_eu", + "gpu_subslices_per_slice", + "gpu_eu_count_per_subslice", + "max_mem_bandwidth", + ] + for descriptor_name in descr.keys(): + test = descriptor_name in allowed_names + err_msg = f"Key '{descriptor_name}' is not recognized" + assert test, err_msg diff --git a/dpctl/utils/CMakeLists.txt b/dpctl/utils/CMakeLists.txt index 11b0930052..aadc1c0fe0 100644 --- a/dpctl/utils/CMakeLists.txt +++ b/dpctl/utils/CMakeLists.txt @@ -4,3 +4,26 @@ foreach(_cy_file ${_cython_sources}) get_filename_component(_trgt ${_cy_file} NAME_WLE) build_dpctl_ext(${_trgt} ${_cy_file} "dpctl/utils") endforeach() + +add_custom_target(_dpctl4pybind11_header_ready + DEPENDS + _usmarray_copy_capi_include + _memory_copy_capi_include + _sycl_device_copy_capi_include + _sycl_queue_copy_capi_include + _sycl_context_copy_capi_include + _sycl_event_copy_capi_include +) + +set(python_module_name _device_queries) +set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/src/device_queries.cpp) +pybind11_add_module(${python_module_name} MODULE + ${_module_src} +) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +target_include_directories(${python_module_name} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../include +) +add_dependencies(${python_module_name} _dpctl4pybind11_header_ready) +install(TARGETS ${python_module_name} DESTINATION "dpctl/utils") diff --git a/dpctl/utils/__init__.py b/dpctl/utils/__init__.py index 671564cda5..fb41b3b74c 100644 --- a/dpctl/utils/__init__.py +++ b/dpctl/utils/__init__.py @@ -18,18 +18,85 @@ A collection of utility functions. """ +from .._sycl_device import SyclDevice from ._compute_follows_data import ( ExecutionPlacementError, get_coerced_usm_type, get_execution_queue, validate_usm_type, ) +from ._device_queries import ( + intel_device_info_device_id, + intel_device_info_gpu_eu_count, + intel_device_info_gpu_eu_count_per_subslice, + intel_device_info_gpu_eu_simd_width, + intel_device_info_gpu_hw_threads_per_eu, + intel_device_info_gpu_slices, + intel_device_info_gpu_subslices_per_slice, + intel_device_info_max_mem_bandwidth, +) from ._onetrace_context import onetrace_enabled + +def intel_device_info(dev): + """intel_device_info(sycl_device) + + For Intel(R) GPU devices returns a dictionary + with device architectural details, and an empty + dictionary otherwise. 
The dictionary contains
+    the following keys:
+
+        device_id: 32-bit device PCI identifier
+        gpu_eu_count: Total number of execution units
+        gpu_hw_threads_per_eu: Number of thread contexts in EU
+        gpu_eu_simd_width: Physical SIMD width of EU
+        gpu_slices: Total number of slices
+        gpu_subslices_per_slice: Number of sub-slices per slice
+        gpu_eu_count_per_subslice: Number of EUs in subslice
+        max_mem_bandwidth: Maximum memory bandwidth in bytes/second
+
+    Unsupported descriptors are omitted from the dictionary.
+    Descriptors other than PCI identifier are supported only for
+    SyclDevices with Level-Zero backend.
+    """
+    if not isinstance(dev, SyclDevice):
+        raise TypeError(f"Expected dpctl.SyclDevice, got {type(dev)}")
+    dev_id = intel_device_info_device_id(dev)
+    if dev_id:
+        res = {
+            "device_id": dev_id,
+        }
+        if dev.has_aspect_gpu:
+            eu_count = intel_device_info_gpu_eu_count(dev)
+            if eu_count:
+                res["gpu_eu_count"] = eu_count
+            hw_threads = intel_device_info_gpu_hw_threads_per_eu(dev)
+            if hw_threads:
+                res["gpu_hw_threads_per_eu"] = hw_threads
+            simd_w = intel_device_info_gpu_eu_simd_width(dev)
+            if simd_w:
+                res["gpu_eu_simd_width"] = simd_w
+            n_slices = intel_device_info_gpu_slices(dev)
+            if n_slices:
+                res["gpu_slices"] = n_slices
+            n_subslices = intel_device_info_gpu_subslices_per_slice(dev)
+            if n_subslices:
+                res["gpu_subslices_per_slice"] = n_subslices
+            n_eu_per_subslice = intel_device_info_gpu_eu_count_per_subslice(dev)
+            if n_eu_per_subslice:
+                res["gpu_eu_count_per_subslice"] = n_eu_per_subslice
+        bw = intel_device_info_max_mem_bandwidth(dev)
+        if bw:
+            res["max_mem_bandwidth"] = bw
+        return res
+    return dict()
+
+
 __all__ = [
     "get_execution_queue",
     "get_coerced_usm_type",
     "validate_usm_type",
     "onetrace_enabled",
+    "intel_device_info",
     "ExecutionPlacementError",
 ]
diff --git a/dpctl/utils/src/device_queries.cpp b/dpctl/utils/src/device_queries.cpp
new file mode 100644
index 0000000000..6407e69dbb
--- /dev/null
+++ b/dpctl/utils/src/device_queries.cpp
@@ -0,0 +1,139 @@
+#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+
+#include <cstddef>
+#include <cstdint>
+
+namespace
+{
+
+std::uint32_t py_intel_device_id(const sycl::device &d)
+{
+    static constexpr std::uint32_t device_id_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_device_id)) {
+        return d.get_info<sycl::ext::intel::info::device::device_id>();
+    }
+
+    return device_id_unavailable;
+}
+
+std::uint32_t py_intel_gpu_eu_count(const sycl::device &d)
+{
+    static constexpr std::uint32_t eu_count_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_eu_count)) {
+        return d.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
+    }
+
+    return eu_count_unavailable;
+}
+
+std::uint32_t py_intel_gpu_hw_threads_per_eu(const sycl::device &d)
+{
+    static constexpr std::uint32_t thread_count_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
+        return d
+            .get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
+    }
+
+    return thread_count_unavailable;
+}
+
+std::uint32_t py_intel_gpu_eu_simd_width(const sycl::device &d)
+{
+    static constexpr std::uint32_t width_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_eu_simd_width)) {
+        return d.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>();
+    }
+
+    return width_unavailable;
+}
+
+std::uint32_t py_intel_gpu_slices(const sycl::device &d)
+{
+    static constexpr std::uint32_t count_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_slices)) {
+        return d.get_info<sycl::ext::intel::info::device::gpu_slices>();
+    }
+
+    return count_unavailable;
+}
+
+std::uint32_t py_intel_gpu_subslices_per_slice(const sycl::device &d)
+{
+    static constexpr std::uint32_t count_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_subslices_per_slice)) {
+        return d.get_info<
+            sycl::ext::intel::info::device::gpu_subslices_per_slice>();
+    }
+
+    return count_unavailable;
+}
+
+std::uint32_t py_intel_gpu_eu_count_per_subslice(const sycl::device &d)
+{
+    static constexpr std::uint32_t count_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_eu_count_per_subslice)) {
+        return d.get_info<
+            sycl::ext::intel::info::device::gpu_eu_count_per_subslice>();
+    }
+
+    return count_unavailable;
+}
+
+std::uint64_t py_intel_max_mem_bandwidth(const sycl::device &d)
+{
+    static constexpr std::uint64_t bandwidth_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_max_mem_bandwidth)) {
+        return d.get_info<sycl::ext::intel::info::device::max_mem_bandwidth>();
+    }
+
+    return bandwidth_unavailable;
+}
+
+}; // namespace
+
+PYBIND11_MODULE(_device_queries, m)
+{
+    m.def("intel_device_info_device_id", &py_intel_device_id,
+          "Get ext_intel_device_id for the device, zero if not an Intel device",
+          py::arg("device"));
+
+    m.def("intel_device_info_gpu_eu_count", &py_intel_gpu_eu_count,
+          "Returns the number of execution units (EUs) associated with the "
+          "Intel GPU.",
+          py::arg("device"));
+
+    m.def("intel_device_info_gpu_hw_threads_per_eu",
+          &py_intel_gpu_hw_threads_per_eu,
+          "Returns the number of hardware threads in EU.", py::arg("device"));
+
+    m.def("intel_device_info_gpu_eu_simd_width", &py_intel_gpu_eu_simd_width,
+          "Returns the physical SIMD width of the execution unit (EU).",
+          py::arg("device"));
+
+    m.def("intel_device_info_gpu_slices", &py_intel_gpu_slices,
+          "Returns the number of slices in the GPU device, or zero.",
+          py::arg("device"));
+
+    m.def("intel_device_info_gpu_subslices_per_slice",
+          &py_intel_gpu_subslices_per_slice,
+          "Returns the number of subslices per slice.", py::arg("device"));
+
+    m.def("intel_device_info_gpu_eu_count_per_subslice",
+          &py_intel_gpu_eu_count_per_subslice,
+          "Returns the number of EUs per subslice of GPU.", py::arg("device"));
+
+    m.def("intel_device_info_max_mem_bandwidth", &py_intel_max_mem_bandwidth,
+          "Returns the maximum memory bandwidth in units of bytes/second.",
+          py::arg("device"));
+}
diff --git a/examples/pybind11/external_usm_allocation/CMakeLists.txt b/examples/pybind11/external_usm_allocation/CMakeLists.txt
index ce231fad4a..c8679ab73a 100644
--- a/examples/pybind11/external_usm_allocation/CMakeLists.txt
+++ b/examples/pybind11/external_usm_allocation/CMakeLists.txt
@@ -1,10 +1,11 @@
-cmake_minimum_required(VERSION 3.21)
+cmake_minimum_required(VERSION 3.21...3.27 FATAL_ERROR)
 
-project(external_usm_allocation LANGUAGES CXX)
+project(external_usm_allocation VERSION 0.1 LANGUAGES CXX
+    DESCRIPTION "Example of passing external C++ USM allocation to Python")
 
 set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake")
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH})
-find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH)
+find_package(IntelSYCL REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 
@@ -13,20 +14,23 @@ set(CMAKE_CXX_STANDARD_REQUIRED True)
 include(FetchContent)
 FetchContent_Declare(
     pybind11
-    URL https://github.com/pybind/pybind11/archive/refs/tags/v2.10.2.tar.gz
-    URL_HASH SHA256=93bd1e625e43e03028a3ea7389bba5d3f9f2596abc074b068e70f4ef9b1314ae
+    URL https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz
+    URL_HASH SHA256=d475978da0cdc2d43b73f30910786759d593a9d8ee05b1b6846d1eb16c6d2e0c
 )
 FetchContent_MakeAvailable(pybind11)
 
-find_package(PythonExtensions REQUIRED)
+find_package(Python REQUIRED COMPONENTS 
Development.Module NumPy) find_package(Dpctl REQUIRED) -find_package(NumPy REQUIRED) set(py_module_name _external_usm_alloc) +set(_sources + external_usm_allocation/_usm_alloc_example.cpp +) pybind11_add_module(${py_module_name} MODULE - external_usm_allocation/_usm_alloc_example.cpp + ${_sources} ) +add_sycl_to_target(TARGET ${py_module_name} SOURCES ${_sources}) target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) install(TARGETS ${py_module_name} DESTINATION external_usm_allocation diff --git a/examples/pybind11/onemkl_gemv/CMakeLists.txt b/examples/pybind11/onemkl_gemv/CMakeLists.txt index 25589e4202..eb70b22982 100644 --- a/examples/pybind11/onemkl_gemv/CMakeLists.txt +++ b/examples/pybind11/onemkl_gemv/CMakeLists.txt @@ -1,9 +1,10 @@ -cmake_minimum_required(VERSION 3.22 FATAL_ERROR) +cmake_minimum_required(VERSION 3.22...3.27 FATAL_ERROR) -project(example_use_mkl_gemm LANGUAGES CXX) +project(example_use_mkl_gemm VERSION 0.1 LANGUAGES CXX + DESCRIPTION "Example of using Python wrapper to oneMKL function") set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) -find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) +find_package(IntelSYCL REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}") @@ -17,12 +18,12 @@ include(GNUInstallDirs) include(FetchContent) FetchContent_Declare( pybind11 - URL https://github.com/pybind/pybind11/archive/refs/tags/v2.10.2.tar.gz - URL_HASH SHA256=93bd1e625e43e03028a3ea7389bba5d3f9f2596abc074b068e70f4ef9b1314ae + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz + URL_HASH SHA256=d475978da0cdc2d43b73f30910786759d593a9d8ee05b1b6846d1eb16c6d2e0c ) FetchContent_MakeAvailable(pybind11) -find_package(PythonExtensions REQUIRED) +find_package(Python REQUIRED COMPONENTS Development.Module NumPy) find_package(Dpctl REQUIRED) find_library(mkl_core NAMES mkl_core PATHS ${MKL_LIBRARY_DIR} REQUIRED) @@ -34,10 +35,12 @@ find_library(OpenCL NAMES OpenCL REQUIRED) set(py_module_name _onemkl) +set(_sources sycl_gemm/_onemkl.cpp) pybind11_add_module(${py_module_name} MODULE - sycl_gemm/_onemkl.cpp + ${_sources} ) +add_sycl_to_target(TARGET ${py_module_name} SOURCES ${_sources}) target_compile_definitions(${py_module_name} PRIVATE -DMKL_ILP64) target_include_directories(${py_module_name} PUBLIC ${MKL_INCLUDE_DIR} sycl_gemm @@ -49,11 +52,14 @@ target_link_libraries(${py_module_name} install(TARGETS ${py_module_name} DESTINATION sycl_gemm) target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) -get_target_property(_sycl_gemm_sources ${py_module_name} SOURCES) -set_source_files_properties(${_sycl_gemm_sources} - PROPERTIES - COMPILE_OPTIONS "-O3" -) +foreach(_src_fn ${_sources}) + get_source_file_property(_compile_options ${_src_fn} COMPILE_OPTIONS) + set(_combined_options ${_compile_options} "-O3") + set_source_files_properties(${_src_fn} + PROPERTIES + COMPILE_OPTIONS "${_combined_options}" + ) +endforeach() target_link_options(${py_module_name} PRIVATE -fsycl-device-code-split=per_kernel) add_executable(standalone_cpp diff --git a/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt b/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt index f246d29924..ec33b2e153 100644 --- a/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt +++ b/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt @@ -1,10 +1,11 @@ 
-cmake_minimum_required(VERSION 3.21) +cmake_minimum_required(VERSION 3.21...3.27 FATAL_ERROR) -project(use_queue_device LANGUAGES CXX) +project(use_queue_device VERSION 0.1 LANGUAGES CXX + DESCRIPTION "Example of using dpctl.program.SyclKernel <-> sycl::kernel type casting") set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) -find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) +find_package(IntelSYCL REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) @@ -19,15 +20,16 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(pybind11) -find_package(PythonExtensions REQUIRED) +find_package(Python REQUIRED COMPONENTS Development.Module NumPy) find_package(Dpctl REQUIRED) -find_package(NumPy REQUIRED) set(py_module_name _use_kernel) +set(_sources use_kernel/_example.cpp) pybind11_add_module(${py_module_name} MODULE - use_kernel/_example.cpp + ${_sources} ) +add_sycl_to_target(TARGET ${py_module_name} SOURCES ${_sources}) target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) install(TARGETS ${py_module_name} DESTINATION use_kernel diff --git a/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt b/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt index f7b843d7f5..827388fae1 100644 --- a/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt +++ b/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt @@ -1,10 +1,11 @@ -cmake_minimum_required(VERSION 3.21) +cmake_minimum_required(VERSION 3.21...3.27 FATAL_ERROR) -project(use_queue_device LANGUAGES CXX) +project(use_queue_device VERSION 0.1 LANGUAGES CXX + DESCRIPTION "Example of using dpctl.SyclQueue <-> sycl::queue type caster") set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) -find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) +find_package(IntelSYCL REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) @@ -13,20 +14,21 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) include(FetchContent) FetchContent_Declare( pybind11 - URL https://github.com/pybind/pybind11/archive/refs/tags/v2.10.2.tar.gz - URL_HASH SHA256=93bd1e625e43e03028a3ea7389bba5d3f9f2596abc074b068e70f4ef9b1314ae + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz + URL_HASH SHA256=d475978da0cdc2d43b73f30910786759d593a9d8ee05b1b6846d1eb16c6d2e0c ) FetchContent_MakeAvailable(pybind11) -find_package(PythonExtensions REQUIRED) +find_package(Python REQUIRED COMPONENTS Development.Module NumPy) find_package(Dpctl REQUIRED) -find_package(NumPy REQUIRED) set(py_module_name _use_queue_device) +set(_sources use_queue_device/_example.cpp) pybind11_add_module(${py_module_name} MODULE - use_queue_device/_example.cpp + ${_sources} ) +add_sycl_to_target(TARGET ${py_module_name} SOURCES ${_sources}) target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) install(TARGETS ${py_module_name} DESTINATION use_queue_device diff --git a/examples/python/sycl_timer.py b/examples/python/sycl_timer.py index f4b1416784..8ae49fd60d 100644 --- a/examples/python/sycl_timer.py +++ b/examples/python/sycl_timer.py @@ -15,14 +15,27 @@ # limitations under the License. 
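The rewrite of `sycl_timer.py` that follows drops the dpnp dependency and times a pure `dpctl.tensor` workload. The core pattern it demonstrates, a profiling-enabled queue wrapped by `SyclTimer`, looks roughly like this (the workload here is illustrative):

```python
import dpctl
import dpctl.tensor as dpt
from dpctl import SyclTimer

# device timestamps are only collected on queues created
# with the "enable_profiling" property
q = dpctl.SyclQueue(property="enable_profiling")

timer = SyclTimer(time_scale=1)  # report times in seconds
with timer(q):
    x = dpt.arange(10**6, dtype=dpt.float32, sycl_queue=q)
    s = dpt.sum(x * x)

host_time, device_time = timer.dt  # wall-clock vs. device execution time
print(host_time, device_time)
```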
-import dpnp import numpy as np import dpctl import dpctl.tensor as dpt from dpctl import SyclTimer -n = 4000 + +def matmul(m1, m2): + """Naive matrix multiplication implementation""" + assert m1.ndim == 2 + assert m2.ndim == 2 + assert m1.shape[1] == m2.shape[0] + m1 = m1[:, dpt.newaxis, :] + m2 = dpt.permute_dims(m2, (1, 0))[dpt.newaxis, :, :] + # form m_prod[i, j, k] = m1[i,k] * m2[k, j] + m_prods = m1 * m2 + # sum over k + return dpt.sum(m_prods, axis=-1) + + +n = 500 try: q = dpctl.SyclQueue(property="enable_profiling") @@ -33,32 +46,36 @@ ) exit(0) -a = dpt.reshape(dpt.arange(n * n, dtype=np.float32, sycl_queue=q), (n, n)) -b = dpt.reshape( - dpt.asarray(np.random.random(n * n), dtype=np.float32, sycl_queue=q), (n, n) -) +a_flat = dpt.arange(n * n, dtype=dpt.float32, sycl_queue=q) +a = dpt.reshape(a_flat, (n, n)) -timer = SyclTimer(time_scale=1) +b_rand = np.random.random(n * n).astype(np.float32) +b_flat = dpt.asarray(b_rand, dtype=dpt.float32, sycl_queue=q) +b = dpt.reshape(b_flat, (n, n)) wall_times = [] device_times = [] + print( - f"Performing matrix multiplication of two {n} by {n} matrices " + f"Computing naive matrix multiplication of two {n} by {n} matrices " f"on {q.sycl_device.name}, repeating 5 times." ) +print() for _ in range(5): + timer = SyclTimer(time_scale=1) with timer(q): - a_matmul_b = dpnp.matmul(a, b) + a_matmul_b = matmul(a, b) host_time, device_time = timer.dt wall_times.append(host_time) device_times.append(device_time) -c = dpnp.asnumpy(a_matmul_b) -cc = np.dot(dpnp.asnumpy(a), dpnp.asnumpy(b)) +c = dpt.asnumpy(a_matmul_b) +cc = np.dot(dpt.asnumpy(a), dpt.asnumpy(b)) print("Wall time: ", wall_times, "\nDevice time: ", device_times) +print() print( "Accuracy test: passed." if np.allclose(c, cc) - else (f"Accuracy test: failed. Discrepancy {np.max(np.abs(c-cc))}") + else (f"Accuracy test: FAILED. \n Discrepancy = {np.max(np.abs(c-cc))}") ) diff --git a/libsyclinterface/CMakeLists.txt b/libsyclinterface/CMakeLists.txt index 01b0321064..64ec3271b1 100644 --- a/libsyclinterface/CMakeLists.txt +++ b/libsyclinterface/CMakeLists.txt @@ -11,8 +11,8 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/mo find_package(Git REQUIRED) -if(NOT DEFINED IntelDPCPP_FOUND OR NOT IntelDPCPP_FOUND) - find_package(IntelDPCPP REQUIRED) +if(NOT DEFINED IntelSYCL_FOUND OR NOT IntelSYCL_FOUND) + find_package(IntelSYCL REQUIRED) endif() # Option to turn on support for creating Level Zero interoperability programs @@ -43,11 +43,10 @@ option(DPCTL_ENABLE_GLOG ) # Minimum version requirement only when oneAPI dpcpp is used. 
-find_package(IntelDPCPP REQUIRED) if(DPCTL_DPCPP_FROM_ONEAPI) - find_package(IntelSycl 2021.3.0 REQUIRED) + find_package(IntelSyclCompiler 2021.3.0 REQUIRED) else() - find_package(IntelSycl REQUIRED) + find_package(IntelSyclCompiler REQUIRED) endif() if(DPCTL_ENABLE_L0_PROGRAM_CREATION) @@ -57,7 +56,7 @@ if(DPCTL_ENABLE_L0_PROGRAM_CREATION) if (UNIX) find_library(PI_LEVEL_ZERO_LIB NAMES pi_level_zero - HINTS ${IntelSycl_LIBRARY_DIR} + HINTS ${IntelSyclCompiler_LIBRARY_DIR} ) find_program(READELF_PROG readelf) find_program(GREP_PROG grep) @@ -77,7 +76,7 @@ endif() if (UNIX) find_library(PI_OPENCL_LIB NAMES pi_opencl - HINTS ${IntelSycl_LIBRARY_DIR} + HINTS ${IntelSyclCompiler_LIBRARY_DIR} ) find_program(READELF_PROG readelf) find_program(GREP_PROG grep) @@ -157,7 +156,6 @@ elseif(UNIX) string(CONCAT CXXFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}" - "-fsycl " ) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXFLAGS}") @@ -206,6 +204,7 @@ add_library(DPCTLSyclInterface ${sources} ${helper_sources} ) +add_sycl_to_target(TARGET DPCTLSyclInterface SOURCES ${sources} ${helper_sources}) if(DPCTL_GENERATE_COVERAGE) target_link_options(DPCTLSyclInterface @@ -222,9 +221,6 @@ target_include_directories(DPCTLSyclInterface ${CMAKE_CURRENT_SOURCE_DIR}/helper/include/ ${SYCL_INCLUDE_DIR} ) -target_link_libraries(DPCTLSyclInterface - PRIVATE ${IntelSycl_SYCL_LIBRARY} -) if(DPCTL_ENABLE_GLOG) find_package(glog REQUIRED) diff --git a/libsyclinterface/cmake/modules/FindIntelSycl.cmake b/libsyclinterface/cmake/modules/FindIntelSyclCompiler.cmake similarity index 64% rename from libsyclinterface/cmake/modules/FindIntelSycl.cmake rename to libsyclinterface/cmake/modules/FindIntelSyclCompiler.cmake index 84e8946fea..45bb4f583f 100644 --- a/libsyclinterface/cmake/modules/FindIntelSycl.cmake +++ b/libsyclinterface/cmake/modules/FindIntelSyclCompiler.cmake @@ -19,21 +19,23 @@ # # Example usage: # -# find_package(IntelSycl) +# find_package(IntelSyclCompiler) # # If successful, the following variables will be defined: -# IntelSycl_FOUND -# IntelSycl_VERSION -# IntelSycl_INCLUDE_DIR -# IntelSycl_C_COMPILER -# IntelSycl_CXX_COMPILER -# IntelSycl_SYCL_INCLUDE_DIR -# IntelSycl_LIBRARY_DIR -# IntelSycl_SYCL_LIBRARY -# IntelSycl_OPENCL_LIBRARY +# IntelSyclCompiler_FOUND +# IntelSyclCompiler_VERSION +# IntelSyclCompiler_INCLUDE_DIR +# IntelSyclCompiler_C_COMPILER +# IntelSyclCompiler_CXX_COMPILER +# IntelSyclCompiler_SYCL_INCLUDE_DIR +# IntelSyclCompiler_LIBRARY_DIR +# IntelSyclCompiler_SYCL_LIBRARY +# IntelSyclCompiler_OPENCL_LIBRARY include(FindPackageHandleStandardArgs) -find_package(IntelDPCPP REQUIRED) +if(NOT DEFINED IntelSYCL_FOUND OR NOT IntelSYCL_FOUND) + find_package(IntelSYCL REQUIRED) +endif() # We will extract the version information from the compiler set(clangxx_cmd "${CMAKE_CXX_COMPILER}") @@ -91,78 +93,78 @@ execute_process( # If dpcpp is found then set the package variables if(${clangxx_result} MATCHES "0") - string(REPLACE "\n" ";" IntelSycl_VERSION_LIST "${clangxx_ver}") + string(REPLACE "\n" ";" IntelSyclCompiler_VERSION_LIST "${clangxx_ver}") set(IDX 0) - foreach(X ${IntelSycl_VERSION_LIST}) + foreach(X ${IntelSyclCompiler_VERSION_LIST}) message(STATUS "dpcpp ver[${IDX}]: ${X}") MATH(EXPR IDX "${IDX}+1") endforeach() - list(GET IntelSycl_VERSION_LIST 0 VERSION_STRING) + list(GET IntelSyclCompiler_VERSION_LIST 0 VERSION_STRING) # Get the dpcpp version string(REGEX MATCH "[0-9]+\.[0-9]+\.[0-9]+" - IntelSycl_VERSION + IntelSyclCompiler_VERSION ${VERSION_STRING} ) # 
Split out the version into major, minor and patch
-    string(REPLACE "." ";" IntelSycl_VERSION_LIST1 "${IntelSycl_VERSION}")
-    list(GET IntelSycl_VERSION_LIST1 0 IntelSycl_VERSION_MAJOR)
-    list(GET IntelSycl_VERSION_LIST1 1 IntelSycl_VERSION_MINOR)
-    list(GET IntelSycl_VERSION_LIST1 2 IntelSycl_VERSION_PATCH)
-    set(IntelSycl_INCLUDE_DIR ${SYCL_INCLUDE_DIR})
-    set(IntelSycl_SYCL_INCLUDE_DIR ${SYCL_INCLUDE_DIR}/sycl)
-    set(IntelSycl_LIBRARY_DIR ${SYCL_LIBRARY_DIR})
+    string(REPLACE "." ";" IntelSyclCompiler_VERSION_LIST1 "${IntelSyclCompiler_VERSION}")
+    list(GET IntelSyclCompiler_VERSION_LIST1 0 IntelSyclCompiler_VERSION_MAJOR)
+    list(GET IntelSyclCompiler_VERSION_LIST1 1 IntelSyclCompiler_VERSION_MINOR)
+    list(GET IntelSyclCompiler_VERSION_LIST1 2 IntelSyclCompiler_VERSION_PATCH)
+    set(IntelSyclCompiler_INCLUDE_DIR ${SYCL_INCLUDE_DIR})
+    set(IntelSyclCompiler_SYCL_INCLUDE_DIR ${SYCL_INCLUDE_DIR}/sycl)
+    set(IntelSyclCompiler_LIBRARY_DIR ${SYCL_LIBRARY_DIR})
     if("x${CMAKE_SYSTEM_NAME}" STREQUAL "xWindows")
         find_file(
-            IntelSycl_SYCL_LIBRARY
+            IntelSyclCompiler_SYCL_LIBRARY
             NAMES "sycl.lib" "sycl6.lib" "sycl7.lib"
-            PATHS ${IntelSycl_LIBRARY_DIR}
+            PATHS ${IntelSyclCompiler_LIBRARY_DIR}
         )
         find_file(
-            IntelSycl_OPENCL_LIBRARY
+            IntelSyclCompiler_OPENCL_LIBRARY
             NAMES "OpenCL.lib"
-            PATHS ${IntelSycl_LIBRARY_DIR}
+            PATHS ${IntelSyclCompiler_LIBRARY_DIR}
         )
     elseif("x${CMAKE_SYSTEM_NAME}" STREQUAL "xLinux")
         find_file(
-            IntelSycl_SYCL_LIBRARY
+            IntelSyclCompiler_SYCL_LIBRARY
             NAMES "libsycl.so"
-            PATHS ${IntelSycl_LIBRARY_DIR}
+            PATHS ${IntelSyclCompiler_LIBRARY_DIR}
         )
         find_file(
-            IntelSycl_OPENCL_LIBRARY
+            IntelSyclCompiler_OPENCL_LIBRARY
             NAMES "libOpenCL.so"
-            PATHS ${IntelSycl_LIBRARY_DIR}
+            PATHS ${IntelSyclCompiler_LIBRARY_DIR}
         )
     endif()
 endif()
 
 # Check if a specific version of DPCPP is requested.
-if(IntelSycl_FIND_VERSION AND (DEFINED IntelSycl_VERSION)) +if(IntelSyclCompiler_FIND_VERSION AND (DEFINED IntelSyclCompiler_VERSION)) set(VERSION_GT_FIND_VERSION FALSE) versions_greater_equal( - ${IntelSycl_VERSION} - ${IntelSycl_FIND_VERSION} + ${IntelSyclCompiler_VERSION} + ${IntelSyclCompiler_FIND_VERSION} VERSION_GT_FIND_VERSION ) if(VERSION_GT_FIND_VERSION) - set(IntelSycl_FOUND TRUE) + set(IntelSyclCompiler_FOUND TRUE) else() - set(IntelSycl_FOUND FALSE) + set(IntelSyclCompiler_FOUND FALSE) endif() else() - set(IntelSycl_FOUND TRUE) + set(IntelSyclCompiler_FOUND TRUE) endif() -find_package_handle_standard_args(IntelSycl DEFAULT_MSG - IntelSycl_FOUND - IntelSycl_VERSION - IntelSycl_INCLUDE_DIR - IntelSycl_SYCL_INCLUDE_DIR - IntelSycl_LIBRARY_DIR - IntelSycl_SYCL_LIBRARY - IntelSycl_OPENCL_LIBRARY +find_package_handle_standard_args(IntelSyclCompiler DEFAULT_MSG + IntelSyclCompiler_FOUND + IntelSyclCompiler_VERSION + IntelSyclCompiler_INCLUDE_DIR + IntelSyclCompiler_SYCL_INCLUDE_DIR + IntelSyclCompiler_LIBRARY_DIR + IntelSyclCompiler_SYCL_LIBRARY + IntelSyclCompiler_OPENCL_LIBRARY ) diff --git a/libsyclinterface/cmake/modules/GetProjectVersion.cmake b/libsyclinterface/cmake/modules/GetProjectVersion.cmake index c0f4ec4a6f..a863a4ee17 100644 --- a/libsyclinterface/cmake/modules/GetProjectVersion.cmake +++ b/libsyclinterface/cmake/modules/GetProjectVersion.cmake @@ -29,7 +29,7 @@ # VERSION_MINOR # VERSION # SEMVER -cmake_minimum_required( VERSION 3.14.0 ) +cmake_minimum_required(VERSION 3.14...3.27 FATAL_ERROR ) function(get_version) # Use git describe to get latest tag name diff --git a/libsyclinterface/include/Config/dpctl_config.h.in b/libsyclinterface/include/Config/dpctl_config.h.in index f26fc5591b..6e3daffbed 100644 --- a/libsyclinterface/include/Config/dpctl_config.h.in +++ b/libsyclinterface/include/Config/dpctl_config.h.in @@ -31,7 +31,7 @@ #define __SYCL_COMPILER_VERSION_REQUIRED 20221201L /* The DPCPP version used to build dpctl */ -#define DPCTL_DPCPP_VERSION "@IntelSycl_VERSION@" +#define DPCTL_DPCPP_VERSION "@IntelSyclCompiler_VERSION@" #define DPCTL_LIBZE_LOADER_FILENAME "@LIBZE_LOADER_FILENAME@" #define DPCTL_LIBCL_LOADER_FILENAME "@LIBCL_LOADER_FILENAME@" diff --git a/libsyclinterface/include/dpctl_sycl_queue_interface.h b/libsyclinterface/include/dpctl_sycl_queue_interface.h index 1c5e53a395..cc466fce17 100644 --- a/libsyclinterface/include/dpctl_sycl_queue_interface.h +++ b/libsyclinterface/include/dpctl_sycl_queue_interface.h @@ -294,6 +294,29 @@ DPCTLQueue_Memcpy(__dpctl_keep const DPCTLSyclQueueRef QRef, const void *Src, size_t Count); +/*! + * @brief C-API wrapper for ``sycl::queue::memcpy``. + * + * @param QRef An opaque pointer to the ``sycl::queue``. + * @param Dest An USM pointer to the destination memory. + * @param Src An USM pointer to the source memory. + * @param Count A number of bytes to copy. + * @param DepEvents A pointer to array of DPCTLSyclEventRef opaque + * pointers to dependent events. + * @param DepEventsCount A number of dependent events. + * @return An opaque pointer to the ``sycl::event`` returned by the + * ``sycl::queue::memcpy`` function. + * @ingroup QueueInterface + */ +DPCTL_API +__dpctl_give DPCTLSyclEventRef +DPCTLQueue_MemcpyWithEvents(__dpctl_keep const DPCTLSyclQueueRef QRef, + void *Dest, + const void *Src, + size_t Count, + __dpctl_keep const DPCTLSyclEventRef *DepEvents, + size_t DepEventsCount); + /*! * @brief C-API wrapper for ``sycl::queue::prefetch``. 
 *
diff --git a/libsyclinterface/source/dpctl_sycl_device_interface.cpp b/libsyclinterface/source/dpctl_sycl_device_interface.cpp
index b5a97013c2..7a159a331c 100644
--- a/libsyclinterface/source/dpctl_sycl_device_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_device_interface.cpp
@@ -543,6 +543,18 @@ DPCTLDevice_GetParentDevice(__dpctl_keep const DPCTLSyclDeviceRef DRef)
 {
     auto D = unwrap(DRef);
     if (D) {
+        bool is_unpartitioned = false;
+        try {
+            auto pp =
+                D->get_info<sycl::info::device::partition_type_property>();
+            is_unpartitioned =
+                (pp == sycl::info::partition_property::no_partition);
+        } catch (std::exception const &e) {
+            error_handler(e, __FILE__, __func__, __LINE__);
+            return nullptr;
+        }
+        if (is_unpartitioned)
+            return nullptr;
         try {
             const auto &parent_D =
                 D->get_info<sycl::info::device::parent_device>();
             return wrap(new device(parent_D));
diff --git a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp
index 4903b888ff..60098ae933 100644
--- a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp
@@ -410,9 +410,12 @@ DPCTLQueue_SubmitNDRange(__dpctl_keep const DPCTLSyclKernelRef KRef,
     try {
         e = Queue->submit([&](handler &cgh) {
             // Depend on any event that was specified by the caller.
-            if (NDepEvents)
-                for (auto i = 0ul; i < NDepEvents; ++i)
-                    cgh.depends_on(*unwrap(DepEvents[i]));
+            if (DepEvents)
+                for (auto i = 0ul; i < NDepEvents; ++i) {
+                    auto ei = unwrap(DepEvents[i]);
+                    if (ei)
+                        cgh.depends_on(*ei);
+                }
 
             for (auto i = 0ul; i < NArgs; ++i) {
                 // \todo add support for Sycl buffers
@@ -485,6 +488,42 @@ DPCTLQueue_Memcpy(__dpctl_keep const DPCTLSyclQueueRef QRef,
     }
 }
 
+__dpctl_give DPCTLSyclEventRef
+DPCTLQueue_MemcpyWithEvents(__dpctl_keep const DPCTLSyclQueueRef QRef,
+                            void *Dest,
+                            const void *Src,
+                            size_t Count,
+                            const DPCTLSyclEventRef *DepEvents,
+                            size_t DepEventsCount)
+{
+    event ev;
+    auto Q = unwrap(QRef);
+    if (Q) {
+        try {
+            ev = Q->submit([&](handler &cgh) {
+                if (DepEvents)
+                    for (size_t i = 0; i < DepEventsCount; ++i) {
+                        event *ei = unwrap(DepEvents[i]);
+                        if (ei)
+                            cgh.depends_on(*ei);
+                    }
+
+                cgh.memcpy(Dest, Src, Count);
+            });
+        } catch (const std::exception &ex) {
+            error_handler(ex, __FILE__, __func__, __LINE__);
+            return nullptr;
+        }
+    }
+    else {
+        error_handler("QRef passed to memcpy was NULL.", __FILE__, __func__,
+                      __LINE__);
+        return nullptr;
+    }
+
+    return wrap(new event(ev));
+}
+
 __dpctl_give DPCTLSyclEventRef
 DPCTLQueue_Prefetch(__dpctl_keep DPCTLSyclQueueRef QRef,
                     const void *Ptr,
diff --git a/libsyclinterface/tests/CMakeLists.txt b/libsyclinterface/tests/CMakeLists.txt
index 4cfd30338d..472e1787fa 100644
--- a/libsyclinterface/tests/CMakeLists.txt
+++ b/libsyclinterface/tests/CMakeLists.txt
@@ -21,17 +21,39 @@ foreach(tf ${spirv-test-files})
     file(COPY ${tf} DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 endforeach()
 
-if(DPCTL_GENERATE_COVERAGE)
-    file(GLOB_RECURSE
-        sources ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
-    )
+file(GLOB_RECURSE
+    sources ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+)
 
-    # Add all dpctl sources into a single executable so that we can run coverage
-    # analysis and generate a report.
-    add_executable(dpctl_c_api_tests
-        EXCLUDE_FROM_ALL
-        ${sources}
-    )
+# Add all dpctl sources into a single executable so that we can run coverage
+# analysis and generate a report.
+add_executable(dpctl_c_api_tests + EXCLUDE_FROM_ALL + ${sources} +) +add_sycl_to_target( + TARGET dpctl_c_api_tests + SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/test_helper.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_context_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_invalid_filters.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_subdevices.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_manager.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_selector_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_aspects.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_event_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_platform_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_kernel_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_kernel_bundle_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_platform_invalid_filters.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_manager.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_submit.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_usm_interface.cpp +) + +if(DPCTL_GENERATE_COVERAGE) target_include_directories(dpctl_c_api_tests PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../helper/include" PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../include" @@ -40,7 +62,7 @@ if(DPCTL_GENERATE_COVERAGE) ${CMAKE_THREAD_LIBS_INIT} GTest::GTest DPCTLSyclInterface - ${IntelSycl_OPENCL_LIBRARY} + ${IntelSyclCompiler_OPENCL_LIBRARY} ${CMAKE_DL_LIBS} ) set(object_arg "-object;") @@ -90,13 +112,11 @@ if(DPCTL_GENERATE_COVERAGE) DEPENDS dpctl_c_api_tests ) else() - file(GLOB_RECURSE sources ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) - add_executable(dpctl_c_api_tests EXCLUDE_FROM_ALL ${sources}) target_link_libraries(dpctl_c_api_tests ${CMAKE_THREAD_LIBS_INIT} GTest::GTest DPCTLSyclInterface - ${IntelSycl_OPENCL_LIBRARY} + ${IntelSyclCompiler_OPENCL_LIBRARY} ) endif() diff --git a/libsyclinterface/tests/test_sycl_queue_interface.cpp b/libsyclinterface/tests/test_sycl_queue_interface.cpp index 8d23929d39..836a87379b 100644 --- a/libsyclinterface/tests/test_sycl_queue_interface.cpp +++ b/libsyclinterface/tests/test_sycl_queue_interface.cpp @@ -340,6 +340,10 @@ TEST(TestDPCTLSyclQueueInterface, CheckMemOpsZeroQRef) ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Memcpy(QRef, p1, p2, n_bytes)); ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE( + ERef = DPCTLQueue_MemcpyWithEvents(QRef, p1, p2, n_bytes, NULL, 0)); + ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Prefetch(QRef, p1, n_bytes)); ASSERT_FALSE(bool(ERef)); @@ -391,6 +395,10 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckMemOpsNullPtr) ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Memcpy(QRef, p1, p2, n_bytes)); ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE( + ERef = DPCTLQueue_MemcpyWithEvents(QRef, p1, p2, n_bytes, NULL, 0)); + ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Prefetch(QRef, p1, n_bytes)); if (ERef) { ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); @@ -450,6 +458,38 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckMemset) delete[] host_arr; } +TEST_P(TestDPCTLQueueMemberFunctions, CheckMemset2) +{ + DPCTLSyclUSMRef p = nullptr; + DPCTLSyclEventRef Memset_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; + uint8_t val = 42; + size_t nbytes = 256; + uint8_t *host_arr = new uint8_t[nbytes]; + + ASSERT_FALSE(host_arr == nullptr); + + ASSERT_NO_FATAL_FAILURE(p = DPCTLmalloc_device(nbytes, QRef)); + ASSERT_FALSE(p == 
diff --git a/libsyclinterface/tests/test_sycl_queue_interface.cpp b/libsyclinterface/tests/test_sycl_queue_interface.cpp
index 8d23929d39..836a87379b 100644
--- a/libsyclinterface/tests/test_sycl_queue_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_queue_interface.cpp
@@ -340,6 +340,10 @@ TEST(TestDPCTLSyclQueueInterface, CheckMemOpsZeroQRef)
     ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Memcpy(QRef, p1, p2, n_bytes));
     ASSERT_FALSE(bool(ERef));
 
+    ASSERT_NO_FATAL_FAILURE(
+        ERef = DPCTLQueue_MemcpyWithEvents(QRef, p1, p2, n_bytes, NULL, 0));
+    ASSERT_FALSE(bool(ERef));
+
     ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Prefetch(QRef, p1, n_bytes));
     ASSERT_FALSE(bool(ERef));
 
@@ -391,6 +395,10 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckMemOpsNullPtr)
     ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Memcpy(QRef, p1, p2, n_bytes));
     ASSERT_FALSE(bool(ERef));
 
+    ASSERT_NO_FATAL_FAILURE(
+        ERef = DPCTLQueue_MemcpyWithEvents(QRef, p1, p2, n_bytes, NULL, 0));
+    ASSERT_FALSE(bool(ERef));
+
     ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Prefetch(QRef, p1, n_bytes));
     if (ERef) {
         ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
@@ -450,6 +458,38 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckMemset)
     delete[] host_arr;
 }
 
+TEST_P(TestDPCTLQueueMemberFunctions, CheckMemset2)
+{
+    DPCTLSyclUSMRef p = nullptr;
+    DPCTLSyclEventRef Memset_ERef = nullptr;
+    DPCTLSyclEventRef Memcpy_ERef = nullptr;
+    uint8_t val = 42;
+    size_t nbytes = 256;
+    uint8_t *host_arr = new uint8_t[nbytes];
+
+    ASSERT_FALSE(host_arr == nullptr);
+
+    ASSERT_NO_FATAL_FAILURE(p = DPCTLmalloc_device(nbytes, QRef));
+    ASSERT_FALSE(p == nullptr);
+
+    ASSERT_NO_FATAL_FAILURE(
+        Memset_ERef = DPCTLQueue_Memset(QRef, (void *)p, val, nbytes));
+
+    ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents(
+                                QRef, host_arr, p, nbytes, &Memset_ERef, 1));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef));
+
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memset_ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef));
+
+    ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));
+
+    for (size_t i = 0; i < nbytes; ++i) {
+        ASSERT_TRUE(host_arr[i] == val);
+    }
+    delete[] host_arr;
+}
+
 TEST(TestDPCTLSyclQueueInterface, CheckFillNullQRef)
 {
     DPCTLSyclQueueRef QRef = nullptr;
@@ -481,7 +521,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill8)
 {
     using T = uint8_t;
     DPCTLSyclUSMRef p = nullptr;
-    DPCTLSyclEventRef ERef = nullptr;
+    DPCTLSyclEventRef Fill8_ERef = nullptr;
+    DPCTLSyclEventRef Memcpy_ERef = nullptr;
     T val = static_cast<T>(0xB);
     size_t nelems = 256;
     T *host_arr = new T[nelems];
@@ -492,17 +533,15 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill8)
     ASSERT_NO_FATAL_FAILURE(p = DPCTLmalloc_device(nbytes, QRef));
     ASSERT_FALSE(p == nullptr);
 
-    ASSERT_NO_FATAL_FAILURE(ERef =
+    ASSERT_NO_FATAL_FAILURE(Fill8_ERef =
                                 DPCTLQueue_Fill8(QRef, (void *)p, val, nelems));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
-    ERef = nullptr;
 
-    ASSERT_NO_FATAL_FAILURE(ERef =
-                                DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+    ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents(
+                                QRef, host_arr, p, nbytes, &Fill8_ERef, 1));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef));
+
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill8_ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef));
 
     ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));
 
@@ -517,7 +556,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill16)
 {
     using T = uint16_t;
     DPCTLSyclUSMRef p = nullptr;
-    DPCTLSyclEventRef ERef = nullptr;
+    DPCTLSyclEventRef Fill16_ERef = nullptr;
+    DPCTLSyclEventRef Memcpy_ERef = nullptr;
     T val = static_cast<T>(0xAB);
     size_t nelems = 256;
     T *host_arr = new T[nelems];
@@ -529,16 +569,14 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill16)
     ASSERT_FALSE(p == nullptr);
 
     ASSERT_NO_FATAL_FAILURE(
-        ERef = DPCTLQueue_Fill16(QRef, (void *)p, val, nelems));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+        Fill16_ERef = DPCTLQueue_Fill16(QRef, (void *)p, val, nelems));
 
-    ERef = nullptr;
+    ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents(
+                                QRef, host_arr, p, nbytes, &Fill16_ERef, 1));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef));
 
-    ASSERT_NO_FATAL_FAILURE(ERef =
-                                DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill16_ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef));
 
     ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));
 
@@ -553,7 +591,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill32)
 {
     using T = uint32_t;
     DPCTLSyclUSMRef p = nullptr;
-    DPCTLSyclEventRef ERef = nullptr;
+    DPCTLSyclEventRef Fill32_ERef = nullptr;
+    DPCTLSyclEventRef Memcpy_ERef = nullptr;
     T val = static_cast<T>(0xABCD);
     size_t nelems = 256;
     T *host_arr = new T[nelems];
@@ -565,16 +604,14 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill32)
     ASSERT_FALSE(p == nullptr);
 
     ASSERT_NO_FATAL_FAILURE(
-        ERef = DPCTLQueue_Fill32(QRef, (void *)p, val, nelems));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+        Fill32_ERef = DPCTLQueue_Fill32(QRef, (void *)p, val, nelems));
 
-    ERef = nullptr;
+    ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents(
+                                QRef, host_arr, p, nbytes, &Fill32_ERef, 1));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef));
 
-    ASSERT_NO_FATAL_FAILURE(ERef =
-                                DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill32_ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef));
 
     ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));
 
@@ -589,7 +626,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill64)
 {
     using T = uint64_t;
     DPCTLSyclUSMRef p = nullptr;
-    DPCTLSyclEventRef ERef = nullptr;
+    DPCTLSyclEventRef Fill64_ERef = nullptr;
+    DPCTLSyclEventRef Memcpy_ERef = nullptr;
     T val = static_cast<T>(0xABCDEF73);
     size_t nelems = 256;
     T *host_arr = new T[nelems];
@@ -601,16 +639,14 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill64)
     ASSERT_FALSE(p == nullptr);
 
     ASSERT_NO_FATAL_FAILURE(
-        ERef = DPCTLQueue_Fill64(QRef, (void *)p, val, nelems));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+        Fill64_ERef = DPCTLQueue_Fill64(QRef, (void *)p, val, nelems));
 
-    ERef = nullptr;
+    ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents(
+                                QRef, host_arr, p, nbytes, &Fill64_ERef, 1));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef));
 
-    ASSERT_NO_FATAL_FAILURE(ERef =
-                                DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill64_ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef));
 
     ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));
 
@@ -639,7 +675,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill128)
 {
     using T = value128_t;
     DPCTLSyclUSMRef p = nullptr;
-    DPCTLSyclEventRef ERef = nullptr;
+    DPCTLSyclEventRef Fill128_ERef = nullptr;
+    DPCTLSyclEventRef Memcpy_ERef = nullptr;
     T val{static_cast<uint64_t>(0xABCDEF73), static_cast<uint64_t>(0x3746AF05)};
     size_t nelems = 256;
     T *host_arr = new T[nelems];
@@ -651,17 +688,15 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill128)
     ASSERT_FALSE(p == nullptr);
 
     ASSERT_NO_FATAL_FAILURE(
-        ERef = DPCTLQueue_Fill128(QRef, (void *)p,
-                                  reinterpret_cast<uint64_t *>(&val), nelems));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+        Fill128_ERef = DPCTLQueue_Fill128(
+            QRef, (void *)p, reinterpret_cast<uint64_t *>(&val), nelems));
 
-    ERef = nullptr;
+    ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents(
+                                QRef, host_arr, p, nbytes, &Fill128_ERef, 1));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef));
 
-    ASSERT_NO_FATAL_FAILURE(ERef =
-                                DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill128_ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef));
 
     ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));
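One consequence of the dpctl_sycl_device_interface.cpp change at the top of this section: DPCTLDevice_GetParentDevice now returns NULL for an unpartitioned root device instead of surfacing the get_info exception, which makes it usable as a cheap sub-device probe. A hedged sketch, not code from the patch; DPCTLDevice_Delete is assumed to exist as the device analogue of the DPCTLEvent_Delete used throughout this patch.

    // Sketch: NULL now cleanly means "no parent", i.e. a root device.
    bool is_sub_device(DPCTLSyclDeviceRef DRef)
    {
        DPCTLSyclDeviceRef parent = DPCTLDevice_GetParentDevice(DRef);
        if (parent) {
            DPCTLDevice_Delete(parent); // assumed deleter for device refs
            return true;
        }
        return false;
    }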
diff --git a/setup.py b/setup.py
index f6780633cc..eb942a71b4 100644
--- a/setup.py
+++ b/setup.py
@@ -176,9 +176,31 @@ def _get_cmdclass():
         "dpctl.utils",
     ],
     package_data={
-        "dpctl": ["tests/*.*", "tests/helper/*.py", "tests/elementwise/*.py"]
+        "dpctl": [
+            "tests/*.*",
+            "tests/helper/*.py",
+            "tests/elementwise/*.py",
+            "tests/*.pyx",
+            "tests/input_files/*",
+            "resources/cmake/*.cmake",
+            "include/*.h*",
+            "include/syclinterface/*.h*",
+            "include/syclinterface/Config/*.h",
+            "include/syclinterface/Support/*.h",
+            "tensor/libtensor/include/kernels/*.h*",
+            "tensor/libtensor/include/utils/*.h*",
+            "tensor/include/dlpack/*.*",
+            "_sycl*.h",
+            "memory/_memory*.h",
+            "program/_program*.h",
+            "tensor/_usmarray*.h",
+            "*.pxd",
+            "memory/*.pxd",
+            "tensor/*.pxd",
+            "program/*.pxd",
+        ]
     },
-    include_package_data=True,
+    include_package_data=False,
     zip_safe=False,
     setup_requires=["Cython"],
     install_requires=[