diff --git a/.gitignore b/.gitignore index 1c0750848b..f4f1cd0dc1 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,11 @@ *.a *.exe *.gch -build/ -build-*/ +/*.sublime-* +/build/ +/build_*/ +/build-*/ +/install/ +/install_*/ +/install-*/ /Debug/ diff --git a/.travis.yml b/.travis.yml index dba6d536dd..23e1ea44e1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -29,28 +29,17 @@ matrix: env: - COMPILER=g++ - IMG=gcc8 - - CMAKE_EXTRA_FLAGS="-DENABLE_WARNINGS=On -DENABLE_TBB=On" + - CMAKE_EXTRA_FLAGS="-DENABLE_WARNINGS=On -DENABLE_TBB=On -DRAJA_ENABLE_BOUNDS_CHECK=ON" - compiler: clang9 env: - COMPILER=clang++-9 - IMG=clang9 - CMAKE_EXTRA_FLAGS="-DCMAKE_CXX_FLAGS=-fmodules -DENABLE_TBB=On" - - compiler: clang5 - env: - - COMPILER=clang++ - - IMG=clang5 - - CMAKE_EXTRA_FLAGS="-DCMAKE_CXX_FLAGS=-fmodules -DENABLE_TBB=On" - compiler: intel18 env: - COMPILER=/opt/intel/bin/icpc - IMG=icc18 - - CMAKE_EXTRA_FLAGS="-DENABLE_TBB=On" - - compiler: nvcc9 - env: - - COMPILER=g++ - - IMG=nvcc9 - - CMAKE_EXTRA_FLAGS="-DENABLE_CUDA=On -DENABLE_TBB=On" - - DO_TEST=no + - CMAKE_EXTRA_FLAGS="-DENABLE_FORCEINLINE_RECURSIVE=Off -DENABLE_TBB=On" - compiler: nvcc10.2 env: - COMPILER=g++ @@ -86,7 +75,7 @@ matrix: - COMPILER=g++ - IMG=hip - HCC_AMDGPU_TARGET=gfx900 - - CMAKE_EXTRA_FLAGS="-DENABLE_HIP=On -DENABLE_OPENMP=Off -DENABLE_CUDA=Off -DENABLE_WARNINGS_AS_ERRORS=Off" + - CMAKE_EXTRA_FLAGS="-DENABLE_HIP=On -DENABLE_OPENMP=Off -DENABLE_CUDA=Off -DENABLE_WARNINGS_AS_ERRORS=Off -DHIP_HIPCC_FLAGS=-fPIC" - DO_TEST=no diff --git a/CMakeLists.txt b/CMakeLists.txt index e2b9d48137..e85ed1f485 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,9 +8,13 @@ cmake_policy(SET CMP0042 NEW) cmake_policy(SET CMP0048 NEW) +if (APPLE) + cmake_policy(SET CMP0025 NEW) +endif() + # Set version number set(RAJA_VERSION_MAJOR 0) -set(RAJA_VERSION_MINOR 11) +set(RAJA_VERSION_MINOR 12) set(RAJA_VERSION_PATCHLEVEL 0) if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")) @@ -47,7 +51,6 @@ set(ENABLE_GTEST_DEATH_TESTS On CACHE BOOL "Enable tests asserting failure.") set(RAJA_CXX_STANDARD_FLAG "default" CACHE STRING "Specific c++ standard flag to use, default attempts to autodetect the highest available") option(ENABLE_TBB "Build TBB support" Off) -option(ENABLE_CHAI "Build CHAI support" Off) option(ENABLE_TARGET_OPENMP "Build OpenMP on target device support" Off) option(ENABLE_CLANG_CUDA "Use Clang's native CUDA support" Off) option(ENABLE_EXTERNAL_CUB "Use an external cub for scans" Off) @@ -64,6 +67,7 @@ option(ENABLE_FORCEINLINE_RECURSIVE "Enable Forceinline recursive (only supporte option(ENABLE_BENCHMARKS "Build benchmarks" Off) option(RAJA_DEPRECATED_TESTS "Test deprecated features" Off) option(RAJA_ENABLE_BOUNDS_CHECK "Enable bounds checking in RAJA::Views/Layouts" Off) +option(RAJA_TEST_EXHAUSTIVE "Build RAJA exhaustive tests" Off) set(TEST_DRIVER "" CACHE STRING "driver used to wrap test commands") @@ -72,7 +76,7 @@ cmake_minimum_required(VERSION 3.9) if (ENABLE_CUDA) if (DEFINED CUDA_ARCH) if (CUDA_ARCH MATCHES "^sm_*") - if ("${CUDA_ARCH}" STRLESS "sm_35") + if ("${CUDA_ARCH}" STRLESS "sm_35") message( FATAL_ERROR "RAJA requires minimum CUDA compute architecture of sm_35") endif() endif() @@ -85,7 +89,7 @@ if (ENABLE_CUDA) message(STATUS "CUDA compute architecture set to RAJA default sm_35 since it was not specified") set(CUDA_ARCH "sm_35" CACHE STRING "Set CUDA_ARCH to RAJA minimum supported" FORCE) endif() - if (CMAKE_CXX_COMPILER_ID MATCHES GNU) + if ( 
(CMAKE_CXX_COMPILER_ID MATCHES GNU) AND (CMAKE_SYSTEM_PROCESSOR MATCHES ppc64le) ) if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.0) set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -mno-float128") endif () @@ -157,7 +161,9 @@ set (raja_sources src/LockFreeIndexSetBuilders.cpp src/MemUtils_CUDA.cpp src/MemUtils_HIP.cpp - src/PluginStrategy.cpp) + src/PluginStrategy.cpp + src/RuntimePluginLoader.cpp + src/KokkosPluginLoader.cpp) set (raja_depends) @@ -182,7 +188,7 @@ if (ENABLE_CUDA) if (CUB_FOUND) blt_register_library( NAME cub - INCLUDES ${CUB_INCLUDE_DIRS}) + INCLUDES $) set(raja_depends ${raja_depends} cub) @@ -216,38 +222,34 @@ if (ENABLE_HIP) endif () endif () -if (ENABLE_CHAI) - set (raja_depends - ${raja_depends} - chai) -endif () - if (ENABLE_TBB) set(raja_depends ${raja_depends} tbb) endif () -set(EXTERNAL_CAMP_SOURCE_DIR "" CACHE FILEPATH "build with a specific external +if (NOT TARGET camp) + set(EXTERNAL_CAMP_SOURCE_DIR "" CACHE FILEPATH "build with a specific external camp source repository") -if (EXTERNAL_CAMP_SOURCE_DIR) - message(STATUS "Using external source CAMP from: " ${EXTERNAL_CAMP_SOURCE_DIR}) - add_subdirectory(${EXTERNAL_CAMP_SOURCE_DIR} - ${CMAKE_CURRENT_BINARY_DIR}/tpl/camp) -else (EXTERNAL_CAMP_SOURCE_DIR) - find_package(camp QUIET) - if (NOT camp_FOUND) - message(STATUS "Using RAJA CAMP submodule.") - add_subdirectory(tpl/camp) - else (NOT camp_FOUND) - message(STATUS "Using installed CAMP from: ${camp_INSTALL_PREFIX}") - endif(NOT camp_FOUND) -endif (EXTERNAL_CAMP_SOURCE_DIR) + if (EXTERNAL_CAMP_SOURCE_DIR) + message(STATUS "Using external source CAMP from: " ${EXTERNAL_CAMP_SOURCE_DIR}) + add_subdirectory(${EXTERNAL_CAMP_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR}/tpl/camp) + else (EXTERNAL_CAMP_SOURCE_DIR) + find_package(camp QUIET) + if (NOT camp_FOUND) + message(STATUS "Using RAJA CAMP submodule.") + add_subdirectory(tpl/camp) + else (NOT camp_FOUND) + message(STATUS "Using installed CAMP from: ${camp_INSTALL_PREFIX}") + endif(NOT camp_FOUND) + endif (EXTERNAL_CAMP_SOURCE_DIR) +endif (NOT TARGET camp) blt_add_library( NAME RAJA SOURCES ${raja_sources} - DEPENDS_ON ${raja_depends} camp) + DEPENDS_ON ${raja_depends} camp ${CMAKE_DL_LIBS}) install(TARGETS RAJA EXPORT RAJA @@ -262,9 +264,11 @@ target_include_directories(RAJA PUBLIC $ $ - $ - $ $) +target_include_directories(RAJA SYSTEM + PUBLIC + $ + $) install(DIRECTORY include/ DESTINATION include FILES_MATCHING PATTERN *.hpp) if(NOT ENABLE_EXTERNAL_CUB) diff --git a/README.md b/README.md index 6f0ff8f405..2dd606b396 100644 --- a/README.md +++ b/README.md @@ -71,14 +71,14 @@ submodule or as an installed library. User Documentation ------------------- -The [**RAJA User Guide and Tutorial**](http://raja.readthedocs.io/en/master/) +The [**RAJA User Guide and Tutorial**](http://raja.readthedocs.io/en/main/) is the best place to start learning about RAJA and how to use it. To cite RAJA, please use the following references: * RAJA Performance Portability Layer. https://github.com/LLNL/RAJA -* D. A. Beckingsale, J. Burmark, R. Hornung, H. Jones, W. Killian, A. J. Kunen, O. Pearce, P. Robinson, B. S. Ryujin, T. R. W. Scogland, "RAJA: Porrtable Performance for Large-Scale Scientific Applications", 2019 IEEE/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC). [Download here](https://conferences.computer.org/sc19w/2019/#!/toc/14) +* D. A. Beckingsale, J. Burmark, R. Hornung, H. Jones, W. Killian, A. J. Kunen, O. Pearce, P. Robinson, B. S. Ryujin, T. R. W. 
Scogland, "RAJA: Portable Performance for Large-Scale Scientific Applications", 2019 IEEE/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC). [Download here](https://conferences.computer.org/sc19w/2019/#!/toc/14) Related Software -------------------- @@ -114,7 +114,7 @@ The RAJA team follows the [GitFlow](http://nvie.com/posts/a-successful-git-branc include their work in a feature branch created from the RAJA `develop` branch. Then, create a pull request with the `develop` branch as the destination. That branch contains the latest work in RAJA. Periodically, we will merge the -develop branch into the `master` branch and tag a new release. +develop branch into the `main` branch and tag a new release. Authors ----------- diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index b4a5466baa..4ffff68c02 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -10,6 +10,110 @@ Version vxx.yy.zz -- Release date 20yy-mm-dd ============================================ +Version v0.12.0 -- Release date 2020-09-03 +============================================ + +This release contains new features, notable changes, and bug fixes. Please +see the RAJA user guide for more information about items in this release. + +Notable changes include: + + * Notable repository change: + * The 'master' branch in the RAJA git repo has been renamed to 'main'. + + * New features: + * New RAJA "work group" capability added. This allows multiple GPU + kernels to be fused into one kernel launch, greatly reducing the + run time overhead of launching CUDA kernels. + * Added support for dynamic plug-ins in RAJA, which enable the use of + things like Kokkos Performance Profiline Tools to be used with RAJA + (https://github.com/kokkos/kokkos-tools) + * Added ability to pass a resource object to RAJA::forall methods to + enable asynchronous execution for CUDA and HIP back-ends. + * Added "Multi-view" that works like a regular view, except that it + can wrap multiple arrays so their accesses can share index arithmetic. + * Multiple sort algorithms added. This provides portable parallel sort + operations, which are basic parallel algorithm building blocks. + * Introduced RAJA "Teams" concept as an experimental feature. This + enables hierarchical parallelism and additional nested loop patterns + beyond what RAJA::kernel supports. Please note that this is very much + a work-in-progress and is not yet documented in the user guide. + * Added initial support for dynamic loop tiling. + * New OpenMP execution policies added to support static, dynamic, and + guided scheduling. + * Added support for const iterators to be used with RAJA scans. + * Support for bitwise and and or reductions have been added. + * The RAJA::kernel interface has been expanded to allow only segment + index arguments used in a lambda to be passed to the lambda. In + previous versions of RAJA, every lambda invoked in a kernel had to + accept an index argument for every segment in the segment tuple passed + to RAJA::kernel execution templates, even if not all segment indices + were used in a lambda. This release still allows that usage pattern. + The new capability requires an additional template parameter to be + passed to the RAJA::statement::Lambda type, which identify the segment + indices that will be passed and in which order. + + * API Changes: + * The RAJA 'VarOps' namespace has been removed. All entities previously + in that namespace are now in the 'RAJA' namespace. 
+    * RAJA span is now public for users to access and has been made more
+      like std::span.
+    * RAJA::statement::tile_fixed has been moved to RAJA::tile_fixed
+      (namespace change).
+    * RAJA::statement::{Segs, Offsets, Params, ValuesT} have been moved to
+      RAJA::{Segs, Offsets, Params, ValuesT} (namespace change).
+    * RAJA ListSegment constructors have been expanded to accept a camp
+      Resource object. This enables run time specification of the memory
+      space where the data for list segment indices will live. In earlier
+      RAJA versions, the space in which list segment index data lived was a
+      compile-time choice based on whether CUDA or HIP was enabled and the
+      data resided in unified memory for either case. This is still supported
+      in this release, but is marked as a DEPRECATED FEATURE. In the next RAJA
+      release, ListSegment construction will require a camp Resource object.
+      When compiling RAJA with your application, you will see deprecation
+      warnings if you are using the deprecated ListSegment constructor.
+    * A reset method was added to OpenMP target offload reduction classes
+      so they contain the same functionality as reductions for all other
+      back-ends.
+
+  * Build changes/improvements:
+    * The BLT, camp, CUB, and rocPRIM submodules have all been updated to
+      more recent versions. Please note that RAJA now requires rocm version
+      3.5 or newer to use the HIP back-end.
+    * Build for clang9 on macosx has been fixed.
+    * Build for Intel19 on Windows has been fixed.
+    * Host/device annotations have been added to reduction operations to
+      eliminate compiler warnings for certain use cases.
+    * Several warnings generated by the MSVC compiler have been eliminated.
+    * A couple of PGI compiler warnings have been removed.
+    * CMake improvements to make it easier to use an external camp or
+      CUB library with RAJA.
+    * Note that the RAJA tests are undergoing a substantial overhaul. Users
+      who choose to build and run RAJA tests should know that many tests
+      are now being generated in the build space directory structure, which
+      mimics the RAJA source directory structure. As a result, only some
+      test executables appear in the top-level 'test' subdirectory of the
+      build directory; others can be found in lower-level directories. The
+      reason for this change is to reduce test build times for certain
+      compilers.
+
+  * Bug fixes:
+    * An issue with SIMD privatization, which is required to generate
+      correct code with the Intel compiler, has been fixed.
+    * An issue with the atomicExchange() operation for the RAJA HIP back-end
+      has been fixed.
+    * A type issue in the RAJA::kernel implementation involving RAJA span
+      usage has been fixed.
+    * Checks for iterator ranges and container sizes have been added to
+      RAJA scans, which fixes an issue when users attempted to run a
+      scan over a range of size zero.
+    * Several type errors in the Layout.hpp header file have been fixed.
+    * Several fixes have been made in the Layout and StaticLayout types.
+    * Several fixes have been made to the OpenMP target offload back-end
+      to address host-device memory issues.
+    * A variety of RAJA User Guide issues have been addressed, as well as
+      issues in RAJA example codes.
+
 Version v0.11.0 -- Release date 2020-01-29
 ==========================================
@@ -85,7 +189,7 @@ Notable changes include:
   * Added a bounds checking option to RAJA Layout types as a debugging
     feature. This is a compile-time option that will report user errors
     when given View or Layout indices are out-of-bounds.
See View/Layout - section in the RAjA User Guide for instructions on enabling this and + section in the RAJA User Guide for instructions on enabling this and how this feature works. * We've added a RAJA Template Project on GitHub, which shows how to use RAJA in an application, either as a Git submodule or as an diff --git a/blt b/blt index 2c192774b5..bc20f6ab51 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit 2c192774b587c245ec2d7022b2e862395ffa8a21 +Subproject commit bc20f6ab51be6055d8e7ecc3d83e87dc254c7af6 diff --git a/cmake/RAJAMacros.cmake b/cmake/RAJAMacros.cmake index 70cadfc169..0d26065854 100644 --- a/cmake/RAJAMacros.cmake +++ b/cmake/RAJAMacros.cmake @@ -49,6 +49,50 @@ macro(raja_add_executable) ) endmacro(raja_add_executable) +macro(raja_add_plugin_library) + set(options ) + set(singleValueArgs NAME SHARED) + set(multiValueArgs SOURCES DEPENDS_ON) + + cmake_parse_arguments(arg + "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN}) + + list(APPEND arg_DEPENDS_ON RAJA) + + if (ENABLE_OPENMP) + list (APPEND arg_DEPENDS_ON openmp) + endif () + + if (ENABLE_CUDA) + list (APPEND arg_DEPENDS_ON cuda) + endif () + + if (ENABLE_HIP) + list (APPEND arg_DEPENDS_ON hip) + endif () + + if (ENABLE_TBB) + list (APPEND arg_DEPENDS_ON tbb) + endif () + + blt_add_library( + NAME ${arg_NAME} + SOURCES ${arg_SOURCES} + DEPENDS_ON ${arg_DEPENDS_ON} + SHARED ${arg_SHARED} + ) + + #target_include_directories(${arg_NAME} + #PUBLIC + #$ + #$ + #$ + #$ + #$ + #$) + +endmacro(raja_add_plugin_library) + macro(raja_add_test) set(options ) set(singleValueArgs NAME) diff --git a/cmake/SetupCompilers.cmake b/cmake/SetupCompilers.cmake index 2c5d4d6f5c..eb9ec9d2f2 100644 --- a/cmake/SetupCompilers.cmake +++ b/cmake/SetupCompilers.cmake @@ -41,7 +41,7 @@ if ( MSVC ) endif() if (ENABLE_CUDA) - set(CMAKE_CUDA_STANDARD 11) + set(CMAKE_CUDA_STANDARD "11" CACHE STRING "Version of C++ standard for CUDA Builds") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -restrict -arch ${CUDA_ARCH} --expt-extended-lambda --expt-relaxed-constexpr -Xcudafe \"--display_error_number\"") if (NOT RAJA_HOST_CONFIG_LOADED) diff --git a/cmake/SetupRajaConfig.cmake b/cmake/SetupRajaConfig.cmake index f941fa1578..589ba2f84a 100644 --- a/cmake/SetupRajaConfig.cmake +++ b/cmake/SetupRajaConfig.cmake @@ -26,6 +26,7 @@ endif() ## Fault tolerance options option(ENABLE_FT "Enable fault-tolerance features" OFF) option(RAJA_REPORT_FT "Report on use of fault-tolerant features" OFF) +option(ENABLE_ITERATOR_OVERFLOW_DEBUG "Enable Overflow checking during Iterator operations" OFF) ## Timer options set(RAJA_TIMER "chrono" CACHE STRING @@ -62,6 +63,8 @@ set(RAJA_ENABLE_CLANG_CUDA ${ENABLE_CLANG_CUDA}) set(RAJA_ENABLE_HIP ${ENABLE_HIP}) set(RAJA_ENABLE_CUB ${ENABLE_CUB}) +option(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL "Enable use of device function pointers in hip backend" OFF) + # Configure a header file with all the variables we found. 
configure_file(${PROJECT_SOURCE_DIR}/include/RAJA/config.hpp.in ${PROJECT_BINARY_DIR}/include/RAJA/config.hpp) diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index 5ff2012999..642e8db256 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -5,7 +5,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -add_custom_target(docs) +add_custom_target(raja-docs) if (SPHINX_FOUND) add_subdirectory(sphinx/user_guide) diff --git a/docs/doxygen/CMakeLists.txt b/docs/doxygen/CMakeLists.txt index a1f30e42a2..2c83933591 100644 --- a/docs/doxygen/CMakeLists.txt +++ b/docs/doxygen/CMakeLists.txt @@ -21,5 +21,5 @@ add_custom_target(raja-doxygen install(DIRECTORY ${DOXYGEN_HTML_DIR} DESTINATION "docs/doxygen/" OPTIONAL) -add_dependencies(docs +add_dependencies(raja-docs raja-doxygen) diff --git a/docs/sphinx/user_guide/CMakeLists.txt b/docs/sphinx/user_guide/CMakeLists.txt index 0245504b81..be40a41372 100644 --- a/docs/sphinx/user_guide/CMakeLists.txt +++ b/docs/sphinx/user_guide/CMakeLists.txt @@ -23,5 +23,5 @@ add_custom_target(raja-userguide-sphinx install(DIRECTORY "${SPHINX_HTML_DIR}" DESTINATION "docs/user_guide/sphinx/" OPTIONAL) -add_dependencies(docs +add_dependencies(raja-docs raja-userguide-sphinx) diff --git a/docs/sphinx/user_guide/conf.py b/docs/sphinx/user_guide/conf.py index c024360e4a..25b3bb2d91 100644 --- a/docs/sphinx/user_guide/conf.py +++ b/docs/sphinx/user_guide/conf.py @@ -66,9 +66,9 @@ # built documents. # # The short X.Y version. -version = u'0.9' +version = u'0.12' # The full version, including alpha/beta/rc tags. -release = u'0.9.0' +release = u'0.12.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/sphinx/user_guide/config_options.rst b/docs/sphinx/user_guide/config_options.rst index c369f94c9b..8570e09d91 100644 --- a/docs/sphinx/user_guide/config_options.rst +++ b/docs/sphinx/user_guide/config_options.rst @@ -38,21 +38,24 @@ the top-level RAJA directory:: $ make install Following CMake conventions, RAJA supports three build types: ``Release``, -``RelWithDebInfo``, and ``Debug``. Similar to other CMake systems, when you -choose a build type that includes debug information, you do not have to specify -the '-g' compiler flag to generate debugging symbols. +``RelWithDebInfo``, and ``Debug``. With CMake, compiler flags for each of +these build types are applied automatically and so you do not have to +specify them. However, if you want to apply other compiler flags, you will +need to do that using appropriate CMake variables. -All RAJA options are set like standard CMake variables. All RAJA settings for +All RAJA options are set like regular CMake variables. RAJA settings for default options, compilers, flags for optimization, etc. can be found in files -in the ``RAJA/cmake`` directory. Configuration variables can be set by passing +in the ``RAJA/cmake`` directory and top-level ``CMakeLists.txt`` file. +Configuration variables can be set by passing arguments to CMake on the command line when CMake is called, or by setting -options in a CMake cache file and passing that file to CMake. For example, -to enable RAJA OpenMP functionality, pass the following argument to cmake:: +options in a CMake *cache file* and passing that file to CMake using the +CMake ``-C`` options. 
For example, to enable RAJA OpenMP functionality, +pass the following argument to CMake:: -DENABLE_OPENMP=On The RAJA repository contains a collection of CMake cache files -(or 'host-config' files) that may be used as a guide for users trying +(we call them *host-config* files) that may be used as a guide for users trying to set their own options. See :ref:`configopt-raja-hostconfig-label`. Next, we summarize RAJA options and their defaults. @@ -80,18 +83,19 @@ and their default settings: * **Examples, tests, warnings, etc.** - Variables that control whether RAJA tests and examples are built when - the library is compiled are: + Variables that control whether RAJA tests, examples, or tutorial + exercises are built when RAJA is compiled: ====================== ====================== Variable Default ====================== ====================== ENABLE_TESTS On ENABLE_EXAMPLES On + ENABLE_EXERCISES On ====================== ====================== RAJA can also be configured to build with compiler warnings reported as - errors, which may be useful when using RAJA in an application: + errors, which may be useful to make sure your application builds cleanly: ========================= ====================== Variable Default @@ -100,51 +104,57 @@ and their default settings: ========================= ====================== RAJA Views/Layouts may be configured to check for out of bounds - indexing: + indexing at runtime: + ========================= ====================== Variable Default ========================= ====================== RAJA_ENABLE_BOUNDS_CHECK Off ========================= ====================== + + Note that RAJA bounds checking is a runtime check and will add + execution time overhead. Thus, this feature should not be enabled + for release builds. * **Programming model back-ends** Variables that control which RAJA programming model back-ends are enabled are (names are descriptive of what they enable): - ====================== ====================== + ====================== ============================================ Variable Default - ====================== ====================== + ====================== ============================================ ENABLE_OPENMP On - ENABLE_TARGET_OPENMP Off - ENABLE_CUDA Off + ENABLE_TARGET_OPENMP Off (when on, ENABLE_OPENMP must also be on) ENABLE_TBB Off - ====================== ====================== + ENABLE_CUDA Off + ENABLE_HIP Off + ====================== ============================================ Other compilation options are available via the following: - ====================== ====================== + ====================== ========================================== Variable Default - ====================== ====================== - ENABLE_CLANG_CUDA Off + ====================== ========================================== + ENABLE_CLANG_CUDA Off (when on, ENABLE_CUDA must also be on) ENABLE_CUB On (when CUDA enabled) - ====================== ====================== + CUDA_ARCH sm_35 (set based on hardware support) + ====================== ========================================== Turning the 'ENABLE_CLANG_CUDA' variable on will build CUDA code with - the native support in the Clang compiler. When using it, the - 'ENABLE_CUDA' variable must also be turned on. + the native support in the Clang compiler. The 'ENABLE_CUB' variable is used to enable NVIDIA CUB library support for RAJA CUDA scans. Since the CUB library is included in RAJA as a - Git submodule, users should not have to set this in most scenarios. 
+ Git submodule, users should not have to set this in most cases. -.. note:: See :ref:`configopt-raja-backends-label` for more information about - setting compiler flags and other options for RAJA back-ends. +.. note:: See :ref:`getting-started-label` for more information about + setting other options for RAJA back-ends. * **Data types, sizes, alignment, etc.** RAJA provides type aliases that can be used to parameterize floating - point types in applications, which makes it easy to switch between types. + point types in applications, which makes it easier to switch between types. The following variables are used to set the data type for the type alias ``RAJA::Real_type``: @@ -210,32 +220,20 @@ and their default settings: attributes in a typedef. ============================= ======================================== - RAJA internally uses parameters to define platform-specific constants - for index ranges and data alignment. The variables that control these - are: + RAJA internally uses a parameter to define platform-specific constant + data alignment. The variable that control this is: ============================= ====================== Variable Default ============================= ====================== - RAJA_RANGE_ALIGN 4 - RAJA_RANGE_MIN_LENGTH 32 RAJA_DATA_ALIGN 64 ============================= ====================== - What these variables mean: + What this variable means: ============================= ======================================== Variable Meaning ============================= ======================================== - RAJA_RANGE_ALIGN Constrain alignment of begin/end indices - of range segments generated by index set - builder methods; i.e., begin and end - indices of such segments will be - multiples of this value. - RAJA_RANGE_MIN_LENGTH Sets minimum length of range segments - generated by index set builder methods. - This should be an integer multiple of - RAJA_RANGE_ALIGN. RAJA_DATA_ALIGN Specifies data alignment used in intrinsics and typedefs; units of **bytes**. @@ -250,8 +248,10 @@ and their default settings: example codes to determine execution timing and can be used in other apps as well. This timer can use any of three internal timers depending on your preferences, and one should be selected by setting the 'RAJA_TIMER' - variable. If the 'RAJA_CALIPER' variable is turned on (off by default), - the timer will also offer caliper-based region annotations. + variable. If the 'RAJA_USE_CALIPER' variable is turned on (off by default), + the timer will also offer Caliper-based region annotations. Information + about using Caliper can be found at + `Caliper `_ ====================== ====================== Variable Values @@ -299,72 +299,7 @@ and their default settings: Setting RAJA Back-End Features =============================== -To access compiler and hardware optimization features, it is often necessary -to pass options to a compiler. This sections describes how to do this and -which CMake variables to use for certain cases. - -* **OpenMP Compiler Options** - -The variable `OpenMP_CXX_FLAGS` is used to pass OpenMP-related flags to a -compiler. Option syntax follows the CMake *list* pattern. Here is an example -showing how to specify OpenMP target back-end options for NVIDIA GPUs using -the clang compiler as a CMake option:: - - cmake \ - .... - -DOpenMP_CXX_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" - .... 
- -* **CUDA Compiler Options** - -When using the NVIDIA nvcc compiler for RAJA CUDA functionality, the variables: - - * CMAKE_CUDA_FLAGS_RELEASE - * CMAKE_CUDA_FLAGS_DEBUG - * CMAKE_CUDA_FLAGS_RELWITHDEBINFO - -which corresponding to the standard CMake build types are used to pass flags -to nvcc. - -.. note:: When nvcc must pass options to the host compiler, the arguments - can be included in these CMake variables. Each host compiler - option must be prepended with the `-Xcompiler` directive. - -To set the CUDA architecture level for the nvcc compiler, which should be -chosen based on the NVIDIA GPU hardware you are using, you can use the -`CUDA_ARCH` CMake variable. For example, the CMake option:: - - -DCUDA_ARCH=sm_60 - -will tell the compiler to use the `sm_60` SASS architecture in its second -stage of compilation. It will pick the PTX architecture to use in the first -stage of compilation that is suitable for the SASS architecture you specify. - -Alternatively, you may specify the PTX and SASS architectures, using -appropriate nvcc options in the `CMAKE_CUDA_FLAGS_*` variables. - -.. note:: **RAJA requires a minimum CUDA architecture level of `sm_35` to use - all supported CUDA features.** Mostly, the architecture level affects - which RAJA CUDA atomic operations are available and how they are - implemented inside RAJA. This is described in :ref:`atomics-label`. - - * If you do not specify a value for `CUDA_ARCH`, it will be set to - `sm_35` and CMake will emit a status message indicatting this is - the case. - - * If you give a `CUDA_ARCH` value less than `sm_35` (e.g., `sm_30`), - CMake will report this and stop processing. - - -.. _configopt-raja-hostconfig-label: - -======================================= -RAJA Example Build Configuration Files -======================================= - -The ``RAJA/scripts`` directory contains subdirectories with a variety of -build scripts we use to build and test RAJA on various platforms with -various compilers. These scripts pass files (*CMake cache files*) in -the ``RAJA/host-configs`` directory to CMake using the '-C' option. -These files serve as useful examples of how to configure RAJA prior to -compilation. +Various `ENABLE_*` options are listed above for enabling RAJA back-ends, +such as OpenMP and CUDA. To access compiler and hardware optimization features, +it may be necessary to pass additional options to CMake. Please see +:ref:`getting_started-label` for more information. diff --git a/docs/sphinx/user_guide/contributing.rst b/docs/sphinx/user_guide/contributing.rst index fb511cad6d..a97d2123f5 100644 --- a/docs/sphinx/user_guide/contributing.rst +++ b/docs/sphinx/user_guide/contributing.rst @@ -12,9 +12,11 @@ Contributing to RAJA ==================== -This section is intended for folks who want to contribute new features or -bugfixes to RAJA. It assumes you are familiar with Git and GitHub. It -describes what a good pull request (PR) looks like, and the tests that your +RAJA is a collaborative open source software project and it embraces +contributions from others who want to add features or improve existing +features. This section is intended for folks who want to contribute new +features or bugfixes to RAJA. It assumes you are familiar with Git and GitHub. +It describes what a good pull request (PR) looks like, and the tests that your PR must pass before it can be merged into RAJA. 
------------ @@ -22,8 +24,10 @@ Forking RAJA ------------ If you aren't a RAJA developer at LLNL, then you won't have permission to push -new branches to the repository. First, you should create a `fork of the repo -`_. This will create a copy +new branches to the repository. This is due to the policy adopted by the LLNL +organization on GitHub in which the RAJA project resides. Fortunately, you may +still contribute to RAJA by `forking the RAJA repo +`_. This will create a copy of the RAJA repository that you own, and will ensure you can push your changes to GitHub and create pull requests. @@ -33,7 +37,7 @@ Developing a New Feature New features should be based on the RAJA ``develop`` branch. When you want to create a new feature, first ensure you have an up-to-date copy of the -``develop`` branch: +``develop`` branch locally: .. code-block:: bash @@ -48,11 +52,12 @@ Then, create a new branch to develop your feature on: Proceed to develop your feature on this branch pushing changes with reasonably-sized atomic commits, and add tests that will exercise your new -code. If you are creating new methods or classes, please -add Doxygen documentation. +code. If you are creating new functionality, please add documentation to +the `RAJA User Guide `_. Once your feature is complete and your tests are passing, you can push your -branch to GitHub and create a PR. +branch to GitHub and create a PR. It will be reviewed by members of the +core RAJA team, who will provide comments, suggestions, etc. -------------------- Developing a Bug Fix @@ -62,7 +67,7 @@ First, check if the change you want to make has been addressed in the RAJA ``develop`` branch. If so, we suggest you either start using the ``develop`` branch, or temporarily apply the fix to whichever version of RAJA you are using. -Assuming there is an unsolved bug, first make sure you have an up-to-date copy +If there is an unresolved bug, first make sure you have an up-to-date copy of the ``develop`` branch: .. code-block:: bash @@ -86,16 +91,20 @@ Once you are finished, you can push your branch to GitHub, then create a PR. Creating a Pull Request ----------------------- -You can create a new PR `here `_. GitHub -has a good `guide `_ on +You can create a pull request (PR) +`here `_. GitHub has a good +`PR guide `_ on PR basics if you want more information. Ensure that your PR base is the ``develop`` branch of RAJA. -Add a descriptive title explaining the bug you fixed or the feature you have -added, and put a longer description of the changes you have made in the comment -box. +When you create a RAJA PR, you must enter basic information about the +contents of the PR and what it does in the PR summary. Add a descriptive title +explaining the bug you fixed or the feature you have added, and put a longer +description of the changes you have made in the comment box. This will help +reviewers understand your intent and provide a more useful review of your +work. -Once your PR has been created, it will be run through our automated tests and +After your PR has been created, it will be run through our automated tests and also be reviewed by RAJA team members. Providing the branch passes both the tests and reviews, it will be merged into RAJA. @@ -111,5 +120,11 @@ is used on a wide variety of systems with a number of configurations, and adding new tests helps ensure that all features work as expected across these environments. -All RAJA tests are in the ``RAJA/test`` directory and are split up by -programming model back-end and feature. 
+All RAJA tests are in the ``RAJA/test`` directory and are split into +*unit tests* and *functional tests*. Unit tests are intended to test basic +interfaces and features of individual classes, methods, etc. Functional tests +are used to test combinations of RAJA features. Please follow the implementation +pattern of existing tests. We have organized our tests to make it easy to see +what is being tested and easy to add new tests, for a new programming model +back-end, for example. + diff --git a/docs/sphinx/user_guide/developer_guide.rst b/docs/sphinx/user_guide/developer_guide.rst new file mode 100644 index 0000000000..2d1ecd1e59 --- /dev/null +++ b/docs/sphinx/user_guide/developer_guide.rst @@ -0,0 +1,74 @@ +.. developer_guide: + +=============== +Developer Guide +=============== + +Generating RAJA host-config files +=================================== + +.. note:: + This is optional if you are on LC machines, since some host-config files have already been generated (at least for Quartz and Lassen) and can be found in the ``host-configs`` repository directory. + +RAJA only directly depends on CMake. However, this mechanism will generate a cmake configuration file that reproduces the configuration `Spack ` would have generated in the same context. It contains all the information necessary to build RAJA with the described toolchain. + +In particular, the host config file will setup: +* flags corresponding with the target required (Release, Debug). +* compilers path, and other toolkits (cuda if required), etc. + +This provides an easy way to build RAJA based on `Spack ` and encapsulated in `Uberenv `_. + +Uberenv role +------------ + +Uberenv helps by doing the following: + +* Pulls a blessed version of Spack locally +* If you are on a known operating system (like TOSS3), we have defined compilers and system packages so you don't have to rebuild the world (CMake typically in RAJA). +* Overrides RAJA Spack packages with the local one if it exists. (see ``scripts/uberenv/packages``). +* Covers both dependencies and project build in one command. + +Uberenv will create a directory ``uberenv_libs`` containing a Spack instance with the required RAJA dependencies installed. It then generates a host-config file (``.cmake``) at the root of RAJA repository. + +Using Uberenv to generate the host-config file +---------------------------------------------- + +.. code-block:: bash + + $ python scripts/uberenv/uberenv.py + +.. note:: + On LC machines, it is good practice to do the build step in parallel on a compute node. Here is an example command: ``srun -ppdebug -N1 --exclusive python scripts/uberenv/uberenv.py`` + +Unless otherwise specified Spack will default to a compiler. It is recommended to specify which compiler to use: add the compiler spec to the ``--spec`` Uberenv command line option. + +On blessed systems, compiler specs can be found in the Spack compiler files in our repository: ``scripts/uberenv/spack_configs//compilers.yaml``. + +Some examples uberenv options: + +* ``--spec=%clang@9.0.0`` +* ``--spec=%clang@8.0.1+cuda`` +* ``--prefix=`` + +Building dependencies can take a long time. If you already have a spack instance you would like to reuse (in supplement of the local one managed by Uberenv), you can do so changing the uberenv command as follow: + +.. 
code-block:: bash
+
+   $ python scripts/uberenv/uberenv.py --upstream=/opt/spack
+
+Using host-config files to build RAJA
+-------------------------------------
+
+When a host-config file exists for the desired machine and toolchain, it can easily be used in the CMake build process:
+
+If I need to build RAJA with clang and cuda on lassen, I can see there is already a host-config file named ``lassen-blueos_3_ppc64le_ib_p9-clang@8.0.1-cuda.cmake``. To use it (on lassen):
+
+.. code-block:: bash
+
+   $ mkdir build && cd build
+   $ cmake -C ../host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@8.0.1-cuda.cmake ..
+   $ cmake --build . -j
+   $ ctest --output-on-failure -T test
+
+.. note::
+   This will build the default configuration. Not all parameters are embedded into the host-config file. For example, producing shared/static libraries, using OpenMP, or enabling tests must still be configured on the command line.
diff --git a/docs/sphinx/user_guide/feature/iteration_spaces.rst b/docs/sphinx/user_guide/feature/iteration_spaces.rst
index a4ebff7eeb..7519e52ef2 100644
--- a/docs/sphinx/user_guide/feature/iteration_spaces.rst
+++ b/docs/sphinx/user_guide/feature/iteration_spaces.rst
@@ -36,18 +36,15 @@ Just like traditional C and C++ for-loops, RAJA uses index variables to
 identify loop iterates. Any lambda expression that represents all or part
 of a loop body passed to a ``RAJA::forall`` or ``RAJA::kernel`` method will
 take at least one loop index variable argument. RAJA iteration space types
-and methods are templates that allow users to use any integral type for an
+are templates that allow users to use any integral type for an
 index variable. The index variable type may be explicitly specified by a user.
-RAJA also provides a ``RAJA::Index_type`` type, which is used as a default
+RAJA also provides the ``RAJA::Index_type`` type, which is used as a default
 in some circumstances for convenience by allowing use of a common type alias
 to typed constructs without explicitly specifying the type.
-The ``RAJA::Index_type`` type is an alias to the C++ type 'std::ptrdiff_t',
+The ``RAJA::Index_type`` type is an alias to the C++ type ``std::ptrdiff_t``,
 which is appropriate for most compilers to generate useful loop-level
 optimizations.
 
-.. note:: Users can change the type of ``RAJA::Index_type`` by editing the RAJA
-          ``RAJA/include/RAJA/util/types.hpp`` header file.
-
 .. _segments-label:
 
 -------------
 Segments
 -------------
@@ -78,7 +75,7 @@ One can create an explicitly-typed range segment or one with the default
    RAJA::RangeSegment default_range(beg, end);
 
 .. note:: When using a RAJA range segment, no loop iterations will be run when
-          begin is greater-than-or-equal-to end.
+          begin is greater-than-or-equal-to end, similar to a C-style for-loop.
 
 Strided Segments
 ^^^^^^^^^^^^^^^^^^^
@@ -139,6 +136,18 @@ segment constructor. For example::
 
   // Create list segment with these loop indices
  RAJA::TypedListSegment<int> idx_list( &idx[0], static_cast<int>(idx.size()) );
 
+Using a list segment in a RAJA loop traversal template will run the loop
+indices specified in the array passed to the list segment constructor. That
+is, using 'idx_list' from above::
+
+  RAJA::forall< RAJA::seq_exec >( idx_list, [=] (RAJA::Index_type i) {
+    printf("%ld ", i);
+  } );
+
+will print the values::
+
+   0 2 3 4 7 8 9 53
+
 Similar to range segment types, RAJA provides ``RAJA::ListSegment``, which is
 a type alias to ``RAJA::TypedListSegment`` using ``RAJA::Index_type`` as the
 template type parameter.
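Pulling the fragments above into one self-contained sketch (the index values are the same illustrative ones used in the documentation text, and the constructor shown is the pointer/length form used above)::

   #include <cstdio>
   #include <vector>
   #include "RAJA/RAJA.hpp"

   int main()
   {
     std::vector<int> idx{0, 2, 3, 4, 7, 8, 9, 53};

     // Build a typed list segment over the index array. (Newer RAJA versions
     // also provide a constructor taking a camp resource object, as noted in
     // the v0.12.0 release notes, to select the memory space for the indices.)
     RAJA::TypedListSegment<int> idx_list(&idx[0],
                                          static_cast<int>(idx.size()));

     // Visit only the listed indices, in the order given.
     RAJA::forall<RAJA::seq_exec>(idx_list, [=](int i) {
       std::printf("%d ", i);
     });
     std::printf("\n");

     return 0;
   }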
@@ -197,16 +206,17 @@ loop execution template to execute the indices defined by its segments:: // Run a kernel with iterates defined by the index set RAJA::forall(iset, [=] (int i) { ... }); -.. note:: Iterating over the indices of all segments in a RAJA index set - requires a two-level execution policy. The outer level specifies - how to iterate over the seqments. The inner level specifies how - each segment will execute. See :ref:`indexsetpolicy-label` for - more information about IndexSet execution policies. - In this example, the loop iterations will execute in three chunks defined by the two range segments and one list segment. The segments will be iterated over in parallel using OpenMP, and each segment will execute sequentially. +.. note:: Iterating over the indices of all segments in a RAJA index set + requires a two-level execution policy, with two template parameters, + as shown above. The first parameter specifies how to iterate over + the seqments. The second parameter specifies how each segment will + execute. See :ref:`indexsetpolicy-label` for more information about + RAJA index set execution policies. + .. note:: It is the responsibility of the user to ensure that segments are defined properly when using RAJA index sets. For example, if the same index appears in multiple segments, the corresponding loop diff --git a/docs/sphinx/user_guide/feature/local_array.rst b/docs/sphinx/user_guide/feature/local_array.rst index b6a2e55ef8..7207732b85 100644 --- a/docs/sphinx/user_guide/feature/local_array.rst +++ b/docs/sphinx/user_guide/feature/local_array.rst @@ -12,10 +12,10 @@ Local Array =========== -This section introduces RAJA local arrays. A ``RAJA::LocalArray`` is a -multi-dimensional array object whose memory is allocated when a RAJA kernel -is executed and only lives within the scope of the kernel execution. To -motivate the concept and usage, consider a simple C++ example +This section introduces RAJA *local arrays*. A ``RAJA::LocalArray`` is an +array object with one or more dimensions whose memory is allocated when a +RAJA kernel is executed and only lives within the scope of the kernel +execution. To motivate the concept and usage, consider a simple C++ example in which we construct and use two arrays in nested loops:: for(int k = 0; k < 7; ++k) { //k loop @@ -93,19 +93,20 @@ two-dimensional and one one-dimensional and creates an instance of each type. The template arguments for the ``RAJA::LocalArray`` types are: * Array data type - * Index permutation (see :ref:`view-label` for more on layouts and permutations) + * Index permutation (see :ref:`view-label` for more on RAJA permutations) * Array dimensions .. note:: ``RAJA::LocalArray`` types support arbitrary dimensions and sizes. The kernel policy is a two-level nested loop policy (see -:ref:`loop_elements-kernel-label`` for more information) with a statement type -``RAJA::statement::InitLocalMem`` inserted between the nested for-loops which -allocates the memory for the local arrays when the kernel executes. -The ``InitLocalMem`` statement type uses a 'CPU tile' memory type, for the -two entries '0' and '1' in the kernel parameter tuple (second argument to -``RAJA::kernel_param``). Then, the inner initialization loop and inner print -loops are run with the respective lambda bodies defined in the kernel. 
+:ref:`loop_elements-kernel-label` for information about RAJA kernel policies) +with a statement type ``RAJA::statement::InitLocalMem`` inserted between the +nested for-loops which allocates the memory for the local arrays when the +kernel executes. The ``InitLocalMem`` statement type uses a 'CPU tile' memory +type, for the two entries '0' and '1' in the kernel parameter tuple +(second argument to ``RAJA::kernel_param``). Then, the inner initialization +loop and inner print loop are run with the respective lambda bodies defined +in the kernel. ------------------- Memory Policies diff --git a/docs/sphinx/user_guide/feature/loop_basic.rst b/docs/sphinx/user_guide/feature/loop_basic.rst index 741a7d4d66..8d62c6bd4a 100644 --- a/docs/sphinx/user_guide/feature/loop_basic.rst +++ b/docs/sphinx/user_guide/feature/loop_basic.rst @@ -18,11 +18,12 @@ RAJA interface for loop execution. ``RAJA::forall`` methods execute simple loops (e.g., non-nested loops) while ``RAJA::kernel`` methods support nested loops and other complex loop kernels and transformations. -.. note:: * All **forall** and **kernel** methods are in the namespace ``RAJA``. +.. note:: * All ``forall`` and ``kernel`` methods are in the namespace ``RAJA``. * A ``RAJA::forall`` loop execution method is a template on an *execution policy* type. A ``RAJA::forall`` method takes two arguments: - * an iteration space object, and + * an iteration space object, such as a contiguous range of loop + indices, and * a lambda expression representing the loop body. * Each ``RAJA::kernel`` method is a template on a policy that contains statements with *execution policy* types appropriate for @@ -45,8 +46,8 @@ Simple Loops (RAJA::forall) --------------------------- As noted earlier, a ``RAJA::forall`` template executes simple -(e.g., non-nested) loops. For example, a C-style loop that adds two vectors, -like:: +(i.e., non-nested) loops. For example, a C-style loop that adds two vectors, +like this:: for (int i = 0; i < N; ++i) { c[i] = a[i] + b[i]; @@ -67,19 +68,20 @@ objects enable the loop iterates to be partitioned, reordered, run in different threads, etc. .. note:: Changing loop execution policy types and iteration space constructs - enable loops to run in different ways by recompiling the code and + enables loops to run in different ways by recompiling the code and without modifying the loop kernel code. While loop execution using ``RAJA::forall`` methods is a subset of ``RAJA::kernel`` functionality, described next, we maintain the ``RAJA::forall`` interface for simple loop execution because the syntax is -simpler and less verbose. +simpler and less verbose for that use case. .. note:: Data arrays in lambda expressions used with RAJA are typically RAJA Views (see :ref:`view-label`) or bare pointers as shown in the code snippets above. Using something like 'std::vector' is - non-portable (won't work in CUDA kernels) and would add excessive - overhead for copying data into the lambda data environment. + non-portable (won't work in GPU kernels, generally) and would add + excessive overhead for copying data into the lambda data environment + when captured by value. .. _loop_elements-kernel-label: @@ -99,7 +101,7 @@ consider a (N+1)-level C-style loop nest:: } Note that we could write this by nesting ``RAJA::forall`` statements and -it would work, assuming the execution policies were chosen properly:: +it would work for some execution policy choices:: RAJA::forall(IN, [=] (int iN) { ... 
@@ -111,19 +113,22 @@ it would work, assuming the execution policies were chosen properly:: However, this approach treats each loop level as an independent entity. This makes it difficult to parallelize the levels in the loop nest together. So it -limits the amount of parallelism that can be exposed and the types of +may limit the amount of parallelism that can be exposed and the types of parallelism that may be used. For example, if an OpenMP or CUDA parallel execution policy is used on the outermost loop, then all inner loops would be run sequentially in each thread. It also makes it difficult to perform -transformations like loop interchange and loop collapse. +transformations like loop interchange and loop collapse without changing the +source code, which breaks RAJA encapsulation. -The RAJA *kernel* interface facilitates parallel execution and transformations -of arbitrary loop nests and other complex loops. It can treat a complex loop -structure as a single entity, which simplifies the ability to apply kernel -transformations and different parallel execution patterns by changing one -execution policy type. +.. note:: **We do not recommend nesting ``RAJA::forall`` statements.** -The loop nest may be written using the RAJA kernel interface as:: +The RAJA *kernel* interface facilitates parallel execution and compile-time +transformation of arbitrary loop nests and other complex loop structures. +It can treat a complex loop structure as a single entity, which simplifies +the ability to transform and apply different parallel execution patterns by +changing the execution policy type and *not the kernel code*. + +The loop above nest may be written using the RAJA kernel interface as:: using KERNEL_POL = RAJA::KernelPolicy< RAJA::statement::For' symbols enclosing the template parameter lists. + One can think of the '<, >' symbols enclosing the template parameter + lists as being similar to the curly braces in C-style code. Here, the innermost type in the kernel policy is a ``RAJA::statement::Lambda<0>`` type indicating that the first lambda expression @@ -175,11 +180,15 @@ enables non-perfectly nested loops. RAJA offers two types of lambda statements. The first as illustratated above, requires that each lambda expression passed to a ``RAJA::kernel`` method **must take an index argument for each iteration space in the tuple**. -However, any subset of the arguments may actually be used in each lambda expression. +With this type of lambda statement, the entire iteration space must be active +in a containing ``For`` construct. A compile time ``static_assert`` will be +triggered if any of the arguments are undefined, indicating that something +is not correct. The second type of lambda statement, an extension of the first, takes additional -template parameters which are used to specify lambda arguments. This results in -kernel lambdas only requiring arguments which will be used within the body. +template parameters which specify which iteration space indices are passed +as lambda arguments. The result is that a kernel lambda only needs to accept +iteration space index arguments that are used in the lambda body. The kernel policy list with lambda arguments may be written as:: @@ -187,29 +196,30 @@ The kernel policy list with lambda arguments may be written as:: RAJA::KernelPolicy< RAJA::statement::For> + RAJA::statement::Lambda<0, RAJA::Segs> > ... > >; -The template parameter ``RAJA::statement::Segs`` is used to identify elements from the -segment tuple to be used as arguments for a lambda. 
RAJA offers other statements
-such as ``Offsets``, and ``Params`` to identify offsets and parameters in segments and
-param tuples respectively to be used as lambda argumentsx. See :ref:`matrixmultiply-label`
-and :ref:`matrixtransposelocalarray-label` for detailed examples.
-
+The template parameter ``RAJA::Segs`` is used to specify which elements in the
+segment tuple are used to pass arguments to a lambda. RAJA offers other
+types such as ``RAJA::Offsets`` and ``RAJA::Params`` to identify offsets and
+parameters in segments and param tuples respectively to be used as lambda
+arguments. See :ref:`matrixmultiply-label` and
+:ref:`matrixtransposelocalarray-label` for detailed examples.
 
-.. note:: Unless lambda arguments are specified through RAJA lambda statements,
+.. note:: Unless lambda arguments are specified in RAJA lambda statements,
   the loop index arguments for each lambda expression used in a RAJA
   kernel loop body **must match** the contents of the
   *iteration space tuple* in number, order, and type. Not all index
-   arguments must be used in each lambda, but they **all must appear**
-   for the RAJA kernel to be well-formed. In particular, your code will
-   not compile if this is not done correctly. If an argument is unused
-   in a lambda expression, you may include its type and omit its name
-   in the argument list to avoid compiler warnings just as one would do
-   for a regular C++ method.
+   arguments must be used in a lambda, but they **all must appear**
+   in the lambda argument list and **all must be in active loops** to be
+   well-formed. In particular, your code will not compile if this is
+   not done correctly. If an argument is unused in a lambda expression,
+   you may include its type and omit its name in the argument list to
+   avoid compiler warnings just as one would do for a regular C++
+   method with unused arguments.
 
 For RAJA nested loops implemented with ``RAJA::kernel``, as shown here, the
 loop nest ordering is determined by the order of the nested policies, starting
@@ -227,7 +237,11 @@ See :ref:`matmultkernel-label` for a complete example showing RAJA nested loop
 functionality and :ref:`nestedreorder-label` for a detailed example describing
 nested loop reordering.
 
-A summary of all RAJA execution policies that may be used with ``RAJA::forall``
-or ``RAJA::kernel`` may be found in :ref:`policies-label`. Also, a discussion
-of how to construct ``RAJA::KernelPolicy`` types and available
-``RAJA::statement`` types can be found in :ref:`loop_elements-kernelpol-label`.
+.. note:: In general, RAJA execution policies for ``RAJA::forall`` and
+   ``RAJA::kernel`` are different. A summary of all RAJA execution
+   policies that may be used with ``RAJA::forall`` or ``RAJA::kernel``
+   may be found in :ref:`policies-label`.
+
+Finally, a discussion of how to construct ``RAJA::KernelPolicy`` types and
+available ``RAJA::statement`` types can be found in
+:ref:`loop_elements-kernelpol-label`.
diff --git a/docs/sphinx/user_guide/feature/plugins.rst b/docs/sphinx/user_guide/feature/plugins.rst
new file mode 100644
index 0000000000..5592ec83fe
--- /dev/null
+++ b/docs/sphinx/user_guide/feature/plugins.rst
@@ -0,0 +1,129 @@
+.. ##
+.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC
+.. ## and other RAJA project contributors. See the RAJA/COPYRIGHT file
+.. ## for details.
+.. ##
+.. ## SPDX-License-Identifier: (BSD-3-Clause)
+.. ##
+
+.. _plugins-label:
+
+========
+Plugins
+========
+
+------------------
+About RAJA Plugins
+------------------
+
+RAJA supports user-made plugins that may be loaded either at the time of compilation or during runtime. These two methods are not mutually exclusive, as plugins loaded statically can be run alongside plugins that are loaded dynamically.
+
+------------------
+Using RAJA Plugins
+------------------
+
+^^^^^^^^^^^^^^^^^^^^^^^^^
+Static vs Dynamic Loading
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+**Static loading** is done at compile time and requires recompilation in order to add, remove, or change a plugin. This is arguably the easier method to implement, requiring only simple file linking to make it work. However, recompilation may get tedious and resource-heavy when working with many plugins or on large projects. In these cases, it may be a better idea to load plugins dynamically, which requires no recompilation of the project most of the time.
+
+**Dynamic loading** is done at runtime and only requires the recompilation or moving of plugin files in order to add, remove, or change a plugin. This will likely require more work to set up, but in the long run may save time and resources. RAJA will look at the environment variable ``RAJA_PLUGINS`` for a path to a plugin or plugin directory, and automatically load them at runtime. This means that a plugin can be added to a project as easily as making a shared object file and setting ``RAJA_PLUGINS`` to the appropriate path.
+
+^^^^^^^^^^^^^^^^^
+Quick Start Guide
+^^^^^^^^^^^^^^^^^
+
+**Static**
+
+1. Build RAJA normally.
+
+2. Either use an ``#include`` statement within the code or compiler flags to load your plugin file with your project at compile time. A brief example of this would be something like ``g++ project.cpp plugin.cpp -lRAJA -fopenmp -ldl -o project``.
+
+3. When you run your project, your plugin should work!
+
+**Dynamic**
+
+1. Build RAJA normally.
+
+2. Compile your plugin into a shared object file with a .so extension. A brief example of this would be something like ``g++ plugin.cpp -lRAJA -fopenmp -fPIC -shared -o plugin.so``.
+
+3. Set the environment variable ``RAJA_PLUGINS`` to be the path of your .so file. This can either be the path to its directory or to the shared object file itself. If the path is to a directory, it will attempt to load all .so files in that directory.
+
+4. When you run your project, your plugins should work!
+
+^^^^^^^^^^^^^^^^^^^^^^^^
+Interfacing with Plugins
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The RAJA Plugin API allows for limited interfacing between a project and a plugin. There are, however, two functions that allow this to take place: ``init_plugins`` and ``finalize_plugins``. Using one of these will call the corresponding ``init`` or ``finalize`` function inside of *every* currently loaded plugin. It's worth noting that plugins don't require either an init or finalize function by default.
+
+* ``RAJA::util::init_plugins();`` - Will call the ``init`` function of every currently loaded plugin.
+
+* ``RAJA::util::init_plugins("path/to/plugins");`` - Does the same as the above call to init_plugins, but will also dynamically load plugins located at the path specified.
+
+* ``RAJA::util::finalize_plugins();`` - Will call the ``finalize`` function of every currently loaded plugin.
+
+-------------------------
+Creating Plugins For RAJA
+-------------------------
+
+Plugins take advantage of *polymorphism*, using ``RAJA::util::PluginStrategy`` as the parent and implementing the required functions for the API. An example implementation can be found at the bottom of this page.
+
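A minimal sketch of such a plugin is shown here; the class name, counter, and registration strings are placeholder choices, not part of RAJA, and the hook signatures follow the function list in the next section::

   #include <iostream>
   #include "RAJA/util/PluginStrategy.hpp"

   class LaunchLogger : public RAJA::util::PluginStrategy
   {
   public:
     // Called by RAJA immediately before a forall/kernel launches.
     void preLaunch(const RAJA::util::PluginContext&) override {
       std::cout << "RAJA launch #" << ++count_ << " starting\n";
     }

     // Called by RAJA immediately after the launch completes.
     void postLaunch(const RAJA::util::PluginContext&) override {
       std::cout << "RAJA launch #" << count_ << " done\n";
     }

   private:
     int count_{0};
   };

   // Static registration (see 'Static Loading' below); the template argument
   // names the plugin class being registered.
   static RAJA::util::PluginRegistry::add<LaunchLogger> reg(
       "LaunchLogger", "Logs each RAJA kernel launch");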
+
+^^^^^^^^^^^
+Functions
+^^^^^^^^^^^
+
+The ``preLaunch`` and ``postLaunch`` functions are automatically called by
+RAJA before and after loop execution. This applies to RAJA's kernel and
+forall implementations.
+
+* ``void init(const PluginOptions& p) override {}`` - Runs on all plugins
+  when the user makes a call to ``init_plugins``.
+
+* ``void preCapture(const PluginContext& p) override {}`` - Will occur before
+  capture of kernel/forall.
+
+* ``void postCapture(const PluginContext& p) override {}`` - Will occur after
+  capture of kernel/forall.
+
+* ``void preLaunch(const PluginContext& p) override {}`` - Will occur before
+  kernel/forall execution.
+
+* ``void postLaunch(const PluginContext& p) override {}`` - Will occur after
+  kernel/forall execution.
+
+* ``void finalize() override {}`` - Runs on all plugins when the user makes a
+  call to ``finalize_plugins``. This will also unload all currently loaded
+  plugins.
+
+``init`` and ``finalize`` are never run by RAJA by default; they are run only
+when the user makes a call to ``RAJA::util::init_plugins()`` or
+``RAJA::util::finalize_plugins()``, respectively.
+
+^^^^^^^^^^^^^^^^^
+Static Loading
+^^^^^^^^^^^^^^^^^
+
+If the plugin is to be loaded into a project at compile time, adding the
+following one-liner will add the plugin to the RAJA PluginRegistry so that it
+is loaded every time the compiled executable is run. This requires the plugin
+to be compiled with the project, either via an ``#include`` statement within
+the project or by compiler commands.
+::
+
+  static RAJA::util::PluginRegistry::add<MyPluginName> P("Name", "Description");
+
+
+^^^^^^^^^^^^^^^^^
+Dynamic Loading
+^^^^^^^^^^^^^^^^^
+
+If the plugin is to be dynamically loaded into a project at runtime, the RAJA
+Plugin API requires a few conditions to be met. The following must be true
+about the plugin, not necessarily of the project using it.
+
+1. **The plugin must have the following factory function.** This will return
+   a pointer to an instance of your plugin. Thanks to the ``extern "C"``, a
+   project will be able to search for "getPlugin" within the dynamically
+   loaded plugin correctly.
+::
+
+  extern "C" RAJA::util::PluginStrategy *getPlugin ()
+  {
+    return new MyPluginName;
+  }
+
+
+2. **The plugin must be compiled to be a shared object with a .so extension.**
+   A simple example containing the required flags would be:
+   ``g++ plugin.cpp -lRAJA -fopenmp -fPIC -shared -o plugin.so``. At the
+   moment, RAJA will only attempt to load files with .so extensions. It is
+   worth noting why these flags (or their equivalents) are important.
+   ``-lRAJA -fopenmp`` are the standard flags for compiling against the RAJA
+   library. For the purposes of dynamic loading, ``-fPIC`` tells the compiler
+   to produce *position independent code*, which is needed to prevent
+   conflicts in the address space of the executable. ``-shared`` tells the
+   compiler to produce a shared object, removing the need for a *main* as
+   well as giving the loading executable access to functions flagged with
+   ``extern "C"``.
+
+3. **The** ``RAJA_PLUGINS`` **environment variable has been set**, or the
+   user has made a call to ``RAJA::util::init_plugins("path");`` with a path
+   specified to either a directory or a .so file. It is worth noting that
+   these are not mutually exclusive; RAJA will look for plugins from the
+   environment variable on program startup, and new plugins may be loaded
+   after that using ``init_plugins``.
+ + +^^^^^^^^^^^^^^^^^ +Example Implementation +^^^^^^^^^^^^^^^^^ + +The following is an example plugin that simply will print out the number of times a kernel has been launched and has the ability to be loaded either statically or dynamically. + +.. literalinclude:: ../../../../examples/plugin/counter-plugin.cpp + :start-after: _plugin_example_start + :end-before: _plugin_example_end + :language: C++ diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index bbf0166f6b..cd75aa0904 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -13,7 +13,7 @@ Policies ================== This section describes various RAJA policies for loop kernel execution, -scans, reductions, atomics, etc. Each policy is a type that is passed to +scans, sorts, reductions, atomics, etc. Each policy is a type that is passed to a RAJA template method or class to specialize its behavior. Typically, the policy indicates which programming model back-end to use and sometimes provides additional information about the execution pattern, such as @@ -38,15 +38,16 @@ caveats. ====================================== ============= ========================== seq_exec forall, Strictly sequential kernel (For), execution - scan + scan, + sort simd_exec forall, Try to force generation of kernel (For), SIMD instructions via scan compiler hints in RAJA internal implementation loop_exec forall, Allow compiler to generate kernel (For), any optimizations, such as - scan SIMD, that may be - beneficial according to + scan, SIMD, that may be + sort beneficial according to its heuristics; i.e., no loop decorations (pragmas or intrinsics) in @@ -57,29 +58,47 @@ caveats. OpenMP CPU Multithreading Policies Works with Brief description ====================================== ============= ========================== omp_parallel_for_exec forall, Create OpenMP parallel - kernel (For), region and execute with - scan CPU multithreading inside - it; i.e., apply ``omp - parallel for`` pragma + kernel (For), region and execute with + scan, CPU multithreading inside + sort it; i.e., apply ``omp + parallel for`` pragma omp_for_exec forall, Parallel execution with - kernel (For) OpenMP CPU multithreading - inside an *existing* - parallel region; i.e., + kernel (For), OpenMP CPU multithreading + scan inside an *existing* + parallel region (see + comments below); i.e., apply ``omp for`` pragma omp_for_static forall, Execute loop with OpenMP - kernel (For) CPU multithreading using - static schedule and given + kernel (For), CPU multithreading using + scan static schedule and given chunk size inside an *existing* parallel - region; i.e., apply ``omp for schedule(static, + region (see comments + below); i.e., apply ``omp for schedule(static, CHUNK_SIZE)`` pragma omp_for_nowait_exec forall, Parallel execution with - kernel (For) OpenMP CPU multithreading - inside an existing - parallel region without + kernel (For), OpenMP CPU multithreading + scan inside an *existing* + parallel region (see + comments below) without synchronization after loop; i.e., apply ``omp for nowait`` pragma + omp_for_schedule_exec forall, Parallel execution with + kernel (For) OpenMP CPU multithreading + inside an *existing* + parallel region (see + comments below) with a + specified schedule (*Sched*) + omp_for_nowait_schedule_exec forall, Parallel execution with + kernel (For) OpenMP CPU multithreading + inside an *existing* + parallel region (see + comments below) with a + specified 
schedule (*Sched*) + and without synchronization + after loop; e.g., append + ``nowait`` to pragma ====================================== ============= ========================== ====================================== ============= ========================== @@ -87,22 +106,22 @@ caveats. ====================================== ============= ========================== tbb_for_exec forall, Execute loop iterations kernel (For), as tasks in parallel using - scan TBB ``parallel_for`` + scan TBB ``parallel_for`` method tbb_for_static forall, Same as above, but use kernel (For), a static scheduler with scan given chunk size tbb_for_dynamic forall, Same as above, but use kernel (For), a dynamic scheduler - scan + scan ====================================== ============= ========================== ====================================== ============= ========================== CUDA Execution Policies Works with Brief description ====================================== ============= ========================== cuda_exec forall, Execute loop iterations - kernel (For), in a CUDA kernel launched - scan with given thread-block + scan, in a CUDA kernel launched + sort with given thread-block size. If block size not given, the default value of 256 threads/block is @@ -110,7 +129,7 @@ caveats. cuda_thread_x_direct kernel (For) Map loop iterates directly to CUDA threads in x-dimension, one - iterate per thread + iterate per thread (see note below about limitations) cuda_thread_y_direct kernel (For) Same as above, but map @@ -118,19 +137,19 @@ caveats. cuda_thread_z_direct kernel (For) Same as above, but map to threads in z-dimension cuda_thread_x_loop kernel (For) Similar to thread-x-direct - policy, but use a + policy, but use a block-stride loop which - doesn't limit number of + doesn't limit number of loop iterates cuda_thread_y_loop kernel (For) Same as above, but for threads in y-dimension cuda_thread_z_loop kernel (For) Same as above, but for threads in z-dimension - cuda_block_x_direct kernel (For) Map loop iterates - directly to CUDA thread + cuda_block_x_direct kernel (For) Map loop iterates + directly to CUDA thread blocks in x-dimension, one iterate per block - cuda_block_y_direct kernel (For) Same as above, but map + cuda_block_y_direct kernel (For) Same as above, but map to blocks in y-dimension cuda_block_z_direct kernel (For) Same as above, but map to blocks in z-dimension @@ -143,15 +162,15 @@ caveats. blocks in y-dimension cuda_block_z_loop kernel (For) Same as above, but use blocks in z-dimension - cuda_warp_direct kernel (For) Map work to threads + cuda_warp_direct kernel (For) Map work to threads in a warp directly. Cannot be used in conjunction with cuda_thread_x_* policies. Multiple warps can be created by using - cuda_thread_y/z_* - policies. + cuda_thread_y/z_* + policies. cuda_warp_loop kernel (For) Policy to map work to threads in a warp using a warp-stride loop. @@ -161,9 +180,9 @@ caveats. Multiple warps can be created by using cuda_thread_y/z_* - policies. - cuda_warp_mask_direct> kernel (For) Policy to map work - directly to threads in a + policies. + cuda_warp_mask_direct> kernel (For) Policy to map work + directly to threads in a warp using a bit mask. Cannot be used in conjunction with @@ -194,25 +213,25 @@ caveats. 
====================================== ============= ==========================
OpenMP Target Execution Policies       Works with    Brief description
====================================== ============= ==========================
-omp_target_parallel_for_exec<#>        forall        Create parallel target
-                                                     region and execute with
-                                                     given number of threads
+omp_target_parallel_for_exec<#>        forall        Create parallel target
+                                                     region and execute with
+                                                     given number of threads
                                                      per team inside it.
                                                      Number of teams is
                                                      calculated internally; i.e.,
-                                                     apply ``omp teams
-                                                     distribute parallel for
+                                                     apply ``omp teams
+                                                     distribute parallel for
                                                      num_teams(iteration space
                                                      size/#) thread_limit(#)``
                                                      pragma
-omp_target_parallel_collapse_exec      kernel        Similar to above, but
-                                       (Collapse)    collapse
+omp_target_parallel_collapse_exec      kernel        Similar to above, but
+                                       (Collapse)    collapse
                                                      *perfectly-nested*
-                                                     loops, indicated in
+                                                     loops, indicated in
                                                      arguments to RAJA
                                                      Collapse statement. Note:
                                                      compiler determines number
-                                                     of thread teams and
+                                                     of thread teams and
                                                      threads per team
====================================== ============= ==========================
@@ -220,10 +239,53 @@ The following notes provide additional information about policy usage.
.. note:: To control the number of threads used by OpenMP policies set the
          value of the environment variable 'OMP_NUM_THREADS' (which is
-         fixed for duration of run), or call the OpenMP routine
-         'omp_set_num_threads(nthreads)' (which allows changing number of
+         fixed for duration of run), or call the OpenMP routine
+         'omp_set_num_threads(nthreads)' (which allows changing number of
          threads at runtime).

+.. note:: As noted above, some OpenMP policies must only be used within an
+          **existing** parallel region to work the way you would expect them
+          to. For example::
+
+            RAJA::region<RAJA::omp_parallel_region>([=]() {
+
+              RAJA::forall<RAJA::omp_for_nowait_exec>(segment, [=] (int idx) {
+                // do something at iterate 'idx'
+              });
+
+              RAJA::forall<RAJA::omp_for_exec>(segment, [=] (int idx) {
+                // do something else at iterate 'idx'
+              });
+
+            });
+
+          Here, the ``RAJA::region`` method call
+          creates an OpenMP parallel region, which contains two ``RAJA::forall``
+          kernels. The first uses the ``RAJA::omp_for_nowait_exec`` policy,
+          meaning that no thread synchronization is needed after the kernel.
+          Thus, threads can start working on the second kernel while others
+          are still working on the first kernel. In general, this can only be
+          guaranteed to be correct if the segments used in the two kernels
+          are the same and each loop is data parallel. The second kernel uses
+          the ``RAJA::omp_for_exec`` policy, which means that all threads will
+          complete before the kernel exits. In this example, this is not
+          really needed since there is no more code to execute in the parallel
+          region and there is an implicit barrier at the end of it.
+
+.. note:: As noted above, a *Scheduling Policy* can be specified for the
+          ``omp_for_schedule_exec`` and ``omp_for_nowait_schedule_exec`` policies.
+          All possible schedules reside under the ``RAJA::policy::omp`` namespace:
+
+          * ``Static`` is equivalent to ``schedule(static, ChunkSize)``
+          * ``Dynamic`` is equivalent to ``schedule(dynamic, ChunkSize)``
+          * ``Guided`` is equivalent to ``schedule(guided, ChunkSize)``
+          * ``Runtime`` is equivalent to ``schedule(runtime)``
+          * ``Auto`` is equivalent to no schedule specified
+
+          There is a special identifier ``RAJA::policy::omp::default_chunk_size``
+          which can be used as the template argument to ``Static``, ``Dynamic``,
+          or ``Guided`` to defer to the implementation-defined default chunk size.
+
..
note:: To control the number of TBB worker threads used by these policies: set the value of the environment variable 'TBB_NUM_WORKERS' (which is fixed for duration of run), or create a 'task_scheduler_init' object:: @@ -241,27 +303,27 @@ The following notes provide additional information about policy usage. Several notable constraints apply to RAJA CUDA *thread-direct* policies. -.. note:: * Repeating thread direct policies with the same thread dimension - in perfectly nested loops is not recommended. Your code may do +.. note:: * Repeating thread direct policies with the same thread dimension + in perfectly nested loops is not recommended. Your code may do something, but likely will not do what you expect and/or be correct. - * If multiple thread direct policies are used in a kernel (using - different thread dimensions), the product of sizes of the - corresponding iteration spaces cannot be greater than the - maximum allowable threads per block. Typically, this is - equ:math:`\leq` 1024; i.e., attempting to launch a CUDA kernel - with more than 1024 threads per block will cause the CUDA runtime - to complain about *illegal launch parameters.* - * **Thread-direct policies are recommended only for certain loop + * If multiple thread direct policies are used in a kernel (using + different thread dimensions), the product of sizes of the + corresponding iteration spaces cannot be greater than the + maximum allowable threads per block. Typically, this is + equ:math:`\leq` 1024; i.e., attempting to launch a CUDA kernel + with more than 1024 threads per block will cause the CUDA runtime + to complain about *illegal launch parameters.* + * **Thread-direct policies are recommended only for certain loop patterns, such as tiling.** -Several notes regarding CUDA thread and block *loop* policies are also good to +Several notes regarding CUDA thread and block *loop* policies are also good to know. -.. note:: * There is no constraint on the product of sizes of the associated +.. note:: * There is no constraint on the product of sizes of the associated loop iteration space. - * These polices allow having a larger number of iterates than + * These polices allow having a larger number of iterates than threads in the x, y, or z thread dimension. 
- * **Cuda thread and block loop policies are recommended for most + * **Cuda thread and block loop policies are recommended for most loop patterns.** Finally @@ -296,18 +358,18 @@ available to use for the segment iteration policy: Execution Policy Brief description ====================================== ========================================= **Serial** -seq_segit Iterate over index set segments +seq_segit Iterate over index set segments sequentially -**OpenMP CPU multithreading** -omp_parallel_segit Create OpenMP parallel region and - iterate over segments in parallel inside it; i.e., apply ``omp parallel for`` +**OpenMP CPU multithreading** +omp_parallel_segit Create OpenMP parallel region and + iterate over segments in parallel inside it; i.e., apply ``omp parallel for`` pragma on loop over segments omp_parallel_for_segit Same as above **Intel Threading Building Blocks** -tbb_segit Iterate over index set segments in - parallel using a TBB 'parallel_for' +tbb_segit Iterate over index set segments in + parallel using a TBB 'parallel_for' method ====================================== ========================================= @@ -315,14 +377,14 @@ tbb_segit Iterate over index set segments in Parallel Region Policies ------------------------- -The following policies may only be used with the ``RAJA::region`` method. +The following policies may only be used with the ``RAJA::region`` method. ``RAJA::forall`` and ``RAJA::kernel`` methods may be used within a parallel region created with the ``RAJA::region`` construct. * ``seq_region`` - Create a sequential region (see note below). * ``omp_parallel_region`` - Create an OpenMP parallel region. -For example, the following code will execute two consecutive loops in parallel +For example, the following code will execute two consecutive loops in parallel in an OpenMP parallel region without synchronizing threads between them:: RAJA::region( [=]() { @@ -340,9 +402,9 @@ in an OpenMP parallel region without synchronizing threads between them:: }); // end omp parallel region .. note:: The sequential region specialization is essentially a *pass through* - operation. It is provided so that if you want to turn off OpenMP in - your code, you can simply replace the region policy type and you do - not have to change your algorithm source code. + operation. It is provided so that if you want to turn off OpenMP in + your code, you can simply replace the region policy type and you do + not have to change your algorithm source code. .. _reducepolicy-label: @@ -367,7 +429,7 @@ Reduction Policy Loop Policies Brief description to Use With ===================== ============= =========================================== seq_reduce seq_exec, Non-parallel (sequential) reduction - loop_exec + loop_exec omp_reduce any OpenMP OpenMP parallel reduction policy omp_reduce_ordered any OpenMP OpenMP parallel reduction with result @@ -377,7 +439,7 @@ omp_target_reduce any OpenMP OpenMP parallel target offload reduction tbb_reduce any TBB TBB parallel reduction policy cuda_reduce any CUDA Parallel reduction in a CUDA kernel - policy (device synchronization will occur when + policy (device synchronization will occur when reduction value is finalized) cuda_reduce_atomic any CUDA Same as above, but reduction may use CUDA policy atomic operations @@ -395,7 +457,7 @@ Atomic Policies Each RAJA atomic operation must be defined with an 'atomic policy' type. Atomic policy types are distinct from loop execution policy types. -.. 
note :: An atomic policy type must be consistent with the loop execution +.. note :: An atomic policy type must be consistent with the loop execution policy for the kernel in which the atomic operation is used. The following table summarizes RAJA atomic policies and usage. @@ -405,21 +467,21 @@ Atomic Policy Loop Policies Brief description ===================== ============= =========================================== seq_atomic seq_exec, Atomic operation performed in a non-parallel loop_exec (sequential) kernel -omp_atomic any OpenMP Atomic operation performed in an OpenMP - policy multithreading or target kernel; i.e., +omp_atomic any OpenMP Atomic operation performed in an OpenMP + policy multithreading or target kernel; i.e., apply ``omp atomic`` pragma cuda_atomic any CUDA Atomic operation performed in a CUDA kernel - policy + policy builtin_atomic seq_exec, Compiler *builtin* atomic operation loop_exec, any OpenMP - policy + policy auto_atomic seq_exec, Atomic operation *compatible* with loop loop_exec, execution policy. See example below. any OpenMP policy, any CUDA - policy + policy ===================== ============= =========================================== Here is an example illustrating use of the ``auto_atomic`` policy:: @@ -432,13 +494,13 @@ Here is an example illustrating use of the ``auto_atomic`` policy:: }); In this case, the atomic operation knows that it is used in a CUDA kernel -context and the CUDA atomic operation is applied. Similarly, if an OpenMP -execution policy was used, the OpenMP version of the atomic operation would +context and the CUDA atomic operation is applied. Similarly, if an OpenMP +execution policy was used, the OpenMP version of the atomic operation would be used. .. note:: * There are no RAJA atomic policies for TBB (Intel Threading Building Blocks) execution contexts at present. - * The ``builtin_atomic`` policy may be preferable to the + * The ``builtin_atomic`` policy may be preferable to the ``omp_atomic`` policy in terms of performance. .. _localarraypolicy-label: @@ -465,13 +527,13 @@ for ``RAJA::LocalArray`` objects: RAJA Kernel Execution Policies -------------------------------- -RAJA kernel execution policy constructs form a simple domain specific language -for composing and transforming complex loops that relies -**solely on standard C++11 template support**. +RAJA kernel execution policy constructs form a simple domain specific language +for composing and transforming complex loops that relies +**solely on standard C++11 template support**. RAJA kernel policies are constructed using a combination of *Statements* and -*Statement Lists*. A RAJA Statement is an action, such as execute a loop, -invoke a lambda, set a thread barrier, etc. A StatementList is an ordered list -of Statements that are composed in the order that they appear in the kernel +*Statement Lists*. A RAJA Statement is an action, such as execute a loop, +invoke a lambda, set a thread barrier, etc. A StatementList is an ordered list +of Statements that are composed in the order that they appear in the kernel policy to construct a kernel. A Statement may contain an enclosed StatmentList. Thus, a ``RAJA::KernelPolicy`` type is really just a StatementList. The main Statement types provided by RAJA are ``RAJA::statement::For`` and @@ -482,10 +544,10 @@ position of the item it applies to in the iteration space tuple argument to the ``RAJA::kernel`` method. The ExecPolicy is the RAJA execution policy to use on that loop/iteration space (similar to ``RAJA::forall``). 
EnclosedStatements contain whatever is nested within the template parameter -list to form a StatementList, which will be executed for each iteration of -the loop. The ``RAJA::statement::Lambda`` invokes the lambda -corresponding to its position (LambdaID) in the sequence of lambda expressions -in the ``RAJA::kernel`` argument list. For example, a simple sequential +list to form a StatementList, which will be executed for each iteration of +the loop. The ``RAJA::statement::Lambda`` invokes the lambda +corresponding to its position (LambdaID) in the sequence of lambda expressions +in the ``RAJA::kernel`` argument list. For example, a simple sequential for-loop:: for (int i = 0; i < N; ++i) { @@ -508,17 +570,17 @@ can be represented using the RAJA kernel interface as:: } ); -.. note:: All ``RAJA::forall`` functionality can be done using the +.. note:: All ``RAJA::forall`` functionality can be done using the ``RAJA::kernel`` interface. We maintain the ``RAJA::forall`` interface since it is less verbose and thus more convenient for users. - + RAJA::kernel Statement Types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The list below summarizes the current collection of statement types that can be used with ``RAJA::kernel`` and ``RAJA::kernel_param``. More detailed -explanation along with examples of how they are used can be found in +explanation along with examples of how they are used can be found in :ref:`tutorial-label`. .. note:: * All of these statement types are in the namespace ``RAJA``. @@ -531,12 +593,6 @@ explanation along with examples of how they are used can be found in * ``statement::Lambda< LambdaId, Args...>`` extension of the lambda statement; enabling lambda arguments to be specified at compile time. - * ``statement::Segs<...>`` argument to a Lambda statement; used to specify which segments in a tuple will be used as lambda arguments. - - * ``statement::Offsets<...>`` argument to a Lambda statement; used to specify which segment offsets in a tuple will be used as lambda arguments. - - * ``statement::Params<...>`` argument to a Lambda statement; used to specify which params in a tuple will be used as lambda arguments. - * ``statement::Collapse< ExecPolicy, ArgList<...>, EnclosedStatements >`` collapses multiple perfectly nested loops specified by tuple iteration space indices in 'ArgList', using the 'ExecPolicy' execution policy, and places 'EnclosedStatements' inside the collapsed loops which are executed for each iteration. Note that this only works for CPU execution policies (e.g., sequential, OpenMP).It may be available for CUDA in the future if such use cases arise. * ``statement::CudaKernel< EnclosedStatements>`` launches 'EnclosedStatements' as a CUDA kernel; e.g., a loop nest where the iteration spaces of each loop level are associated with threads and/or thread blocks as described by the execution policies applied to them. This kernel launch is synchronous. @@ -550,7 +606,7 @@ explanation along with examples of how they are used can be found in * ``statement::CudaKernelOcc`` similar to CudaKernel but uses the CUDA occupancy calculator to determine the optimal number of threads/blocks. Statement is intended for RAJA::cuda_block_{xyz}_loop policies. This kernel launch is synchronous. * ``statement::CudaKernelOccAsync`` asynchronous version of CudaKernelOcc. - + * ``statement::CudaKernelExp`` similar to CudaKernelOcc but with the flexibility to fix the number of threads and/or blocks and let the CUDA occupancy calculator determine the unspecified values. This kernel launch is synchronous. 
* ``statement::CudaKernelExpAsync`` asynchronous version of CudaKernelExp. @@ -567,15 +623,30 @@ explanation along with examples of how they are used can be found in * ``statement::TileTCount< ArgId, ParamId, TilePolicy, ExecPolicy, EnclosedStatements >`` abstracts an outer tiling loop containing an inner for-loop over each tile, **where it is necessary to obtain the tile number in each tile**. The 'ArgId' indicates which entry in the iteration space tuple to which the loop applies and the 'ParamId' indicates the position of the tile number in the parameter tuple. The 'TilePolicy' specifies the tiling pattern to use, including its dimension. The 'ExecPolicy' and 'EnclosedStatements' are similar to what they represent in a ``statement::For`` type. - * ``statement::tile_fixed`` partitions loop iterations into tiles of a fixed size specified by 'TileSize'. This statement type can be used as the 'TilePolicy' template paramter in the Tile statements above. - * ``statement::ForICount< ArgId, ParamId, ExecPolicy, EnclosedStatements >`` abstracts an inner for-loop within an outer tiling loop **where it is necessary to obtain the local iteration index in each tile**. The 'ArgId' indicates which entry in the iteration space tuple to which the loop applies and the 'ParamId' indicates the position of the tile index parameter in the parameter tuple. The 'ExecPolicy' and 'EnclosedStatements' are similar to what they represent in a ``statement::For`` type. - * ``RAJA::statement::Reduce< ReducePolicy, Operator, ParamId, EnclosedStatements >`` reduces a value across threads to a single thread. The 'ReducePolicy' is similar to what it represents for RAJA reduction types. 'ParamId' specifies the position of the reduction value in the parameter tuple passed to the ``RAJA::kernel_param`` method. 'Operator' is the binary operator used in the reduction; typically, this will be one of the operators that can be used with RAJA scans (see :ref:`scanops-label`. After the reduction is complete, the 'EnclosedStatements' execute on the thread that received the final reduced value. + * ``statement::Reduce< ReducePolicy, Operator, ParamId, EnclosedStatements >`` reduces a value across threads to a single thread. The 'ReducePolicy' is similar to what it represents for RAJA reduction types. 'ParamId' specifies the position of the reduction value in the parameter tuple passed to the ``RAJA::kernel_param`` method. 'Operator' is the binary operator used in the reduction; typically, this will be one of the operators that can be used with RAJA scans (see :ref:`scanops-label`. After the reduction is complete, the 'EnclosedStatements' execute on the thread that received the final reduced value. * ``statement::If< Conditional >`` chooses which portions of a policy to run based on run-time evaluation of conditional statement; e.g., true or false, equal to some value, etc. * ``statement::Hyperplane< ArgId, HpExecPolicy, ArgList<...>, ExecPolicy, EnclosedStatements >`` provides a hyperplane (or wavefront) iteration pattern over multiple indices. A hyperplane is a set of multi-dimensional index values: i0, i1, ... such that h = i0 + i1 + ... for a given h. Here, 'ArgId' is the position of the loop argument we will iterate on (defines the order of hyperplanes), 'HpExecPolicy' is the execution policy used to iterate over the iteration space specified by ArgId (often sequential), 'ArgList' is a list of other indices that along with ArgId define a hyperplane, and 'ExecPolicy' is the execution policy that applies to the loops in ArgList. 
Then, for each iteration, everything in the 'EnclosedStatements' is executed.
+
+The following list summarizes auxiliary types used in the above statements. These
+types live in the ``RAJA`` namespace.
+
+  * ``tile_fixed`` tile policy argument to a ``Tile`` or ``TileTCount`` statement; partitions loop iterations into tiles of a fixed size specified by 'TileSize'. This type can be used as the 'TilePolicy' template parameter in the ``Tile`` statements above.
+
+  * ``tile_dynamic`` TilePolicy argument to a Tile or TileTCount statement; partitions loop iterations into tiles of a size specified by a ``TileSize{}`` positional parameter argument. This type can be used as the 'TilePolicy' template parameter in the ``Tile`` statements above.
+
+  * ``Segs<...>`` argument to a Lambda statement; used to specify which segments in a tuple will be used as lambda arguments.
+
+  * ``Offsets<...>`` argument to a Lambda statement; used to specify which segment offsets in a tuple will be used as lambda arguments.
+
+  * ``Params<...>`` argument to a Lambda statement; used to specify which params in a tuple will be used as lambda arguments.
+
+  * ``ValuesT`` argument to a Lambda statement; used to specify compile time constants, of type T, that will be used as lambda arguments.
+
+
Examples that show how to use a variety of these statement types can be found in :ref:`tutorialcomplex-label`.
diff --git a/docs/sphinx/user_guide/feature/reduction.rst b/docs/sphinx/user_guide/feature/reduction.rst
index 090858b4a2..d3b6112c23 100644
--- a/docs/sphinx/user_guide/feature/reduction.rst
+++ b/docs/sphinx/user_guide/feature/reduction.rst
@@ -13,9 +13,9 @@ Reduction Operations
 ====================

 RAJA does not provide separate loop execution methods for loops containing
-reduction operations like some other C++ loop programming abstraction models do.
+reduction operations like some other C++ loop programming abstraction models.
 Instead, RAJA provides reduction types that allow users to perform reduction
-operations in ``RAJA::forall`` and ``RAJA::kernel`` methods in a portable,
+operations in ``RAJA::forall`` and ``RAJA::kernel`` kernels in a portable,
 thread-safe manner. Users may use as many reduction objects in a loop kernel
 as they need. Available RAJA reduction types are described in this section.
@@ -27,11 +27,14 @@ A detailed example of RAJA reduction usage can be found in
 Also

 .. note:: * Each RAJA reduction type is templated on a **reduction policy**
-            and a **reduction value type** for the reduction variable.
-          * Each RAJA reduction type accepts an **initial reduction value** at
-            construction.
-          * Each RAJA reduction type has a 'get' method to access its reduced
-            value after kernel execution completes.
+            and a **reduction value type** for the reduction variable. The
+            **reduction policy type must be compatible with the execution
+            policy used by the kernel.** For example, in a CUDA kernel,
+            a CUDA reduction policy must be used.
+          * Each RAJA reduction type accepts an **initial reduction value or
+            values** at construction (see below).
+          * Each RAJA reduction type has a 'get' method to access reduced
+            values after kernel execution completes.

----------------
@@ -50,13 +53,28 @@ RAJA supports five common reduction types:
* ``ReduceMaxLoc< reduce_policy, data_type >`` - Max value and a loop index where the maximum was found.
+and two less common bitwise reduction types:
+
+* ``ReduceBitAnd< reduce_policy, data_type >`` - Bitwise 'and' of values (i.e., ``a & b``).
+ +* ``ReduceBitOr< reduce_policy, data_type >`` - Bitwise 'or' of values (i.e., ``a | b``). + .. note:: * When ``RAJA::ReduceMinLoc`` and ``RAJA::ReduceMaxLoc`` are used in a sequential execution context, the loop index of the min/max is the first index where the min/max occurs. - * When the 'loc' reductions are used in a parallel execution context, - the loop index given for the reduction value may be any index + * When these reductions are used in a parallel execution context, + the loop index computed for the reduction value may be any index where the min or max occurs. +.. note:: ``RAJA::ReduceBitAnd`` and ``RAJA::ReduceBitOr`` reduction types are designed to work on integral data types because **in C++, at the language level, there is no such thing as a bitwise operator on floating-point numbers.** + +------------------- +Reduction Examples +------------------- + +Next, we provide a few examples to illustrate basic usage of RAJA reduction +types. + Here is a simple RAJA reduction example that shows how to use a sum reduction type and a min-loc reduction type:: @@ -64,15 +82,19 @@ type and a min-loc reduction type:: // // Initialize array of length N with all ones. Then, set some other - // values to make the example mildly interesting... + // values in the array to make the example mildly interesting... // int vec[N] = {1}; vec[100] = -10; vec[500] = -10; - // Create sum and min-loc reduction objects with initial values + // Create a sum reduction object with initial value of zero RAJA::ReduceSum< RAJA::omp_reduce, int > vsum(0); + + // Create a min-loc reduction object with initial min value of 100 + // and initial location index value of -1 RAJA::ReduceMinLoc< RAJA::omp_reduce, int > vminloc(100, -1); + // Run a kernel using the reduction objects RAJA::forall( RAJA::RangeSegment(0, N), [=](RAJA::Index_type i) { @@ -81,6 +103,7 @@ type and a min-loc reduction type:: }); + // After kernel is run, extract the reduced values int my_vsum = static_cast(vsum.get()); int my_vmin = static_cast(vminloc.get()); @@ -94,7 +117,37 @@ The results of these operations will yield the following values: Note that the location index for the minimum array value can be one of two values depending on the order of the reduction finalization since the loop -is run in parallel. +is run in parallel. Also, note that the reduction objects are created using +a ``RAJA::omp_reduce`` reduction policy, which is compatible with the +OpenMP execution policy used in the kernel. + +Here is an example of a bitwise or reduction:: + + const int N = 100; + + // + // Initialize all entries in array of length N to the value '9' + // + int vec[N] = {9}; + + // Create a bitwise or reduction object with initial value of '5' + RAJA::ReduceBitOr< RAJA::omp_reduce, int > my_or(5); + + // Run a kernel using the reduction object + RAJA::forall( RAJA::RangeSegment(0, N), + [=](RAJA::Index_type i) { + + my_or |= vec[i]; + + }); + + // After kernel is run, extract the reduced value + int my_or_reduce_val = static_cast(my_or.get()); + +The result of the reduction is the value '13'. In binary representation +(i.e., bits), :math:`9 = ...01001` (the vector entries) and +:math:`5 = ...00101` (the initial reduction value). +So :math:`9 | 5 = ...01001 | ...00101 = ...01101 = 13`. 
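+
+The same pattern applies to other back-ends as long as the reduction policy
+matches the execution policy used by the kernel. For instance, a CUDA variant
+of the sum reduction might look like the following sketch (it assumes a
+CUDA-enabled build and that ``d_vec`` points to device-accessible memory of
+length N)::
+
+  RAJA::ReduceSum< RAJA::cuda_reduce, int > dsum(0);
+
+  RAJA::forall< RAJA::cuda_exec<256> >(RAJA::RangeSegment(0, N),
+    [=] RAJA_DEVICE (RAJA::Index_type i) {
+
+    dsum += d_vec[i];
+
+  });
+
+  int my_dsum = static_cast<int>(dsum.get());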
------------------- Reduction Policies diff --git a/docs/sphinx/user_guide/feature/resource.rst b/docs/sphinx/user_guide/feature/resource.rst new file mode 100644 index 0000000000..1daf169299 --- /dev/null +++ b/docs/sphinx/user_guide/feature/resource.rst @@ -0,0 +1,286 @@ +.. ## +.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +.. ## and other RAJA project contributors. See the RAJA/COPYRIGHT file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _resource-label: + +========= +Resources +========= + +This section describes the basic concepts of Resource types and their +functionality in ``RAJA::forall``. Resources are used as an interface to +various backend constructs and their respective hardware. Currently there +exists Resource types for ``Cuda``, ``Hip``, ``Omp`` (target) and ``Host``. +Resource objects allow the user to execute ``RAJA::forall`` calls +asynchronously on a respective thread/stream. The underlying concept of each +individual Resource is still under development and it should be considered +that functionality / behaviour may change. + +.. note:: * Currently feature complete asynchronous behavior and + streamed/threaded support is available only for ``Cuda`` and + ``Hip`` resources. + * The ``RAJA::resources`` namespace aliases the ``camp::resources`` + namespace. + +Each resource has a set of underlying functionality that is synonymous across +all resource types. + + ===================== =============================================== + Methods Brief description + ===================== =============================================== + get_platform Returns the underlying camp platform + the resource is associated with. + get_event Return an Event object for the resource from + the last resource call. + allocate Allocate data per the resource's given + backend. + deallocate Deallocate data per the resource's given + backend. + memcpy Perform a memory copy from a src location + to a destination location from the + resource's backend. + memset Set memory value per the resourse's + given backend. + wait_for Enqueue a wait on the resource's stream/thread + for a user passed event to occur. + ===================== =============================================== + +.. note:: ``deallocate``, ``memcpy`` and ``memset`` will only work with + pointers that correspond to memory locations that have been + allocated on the resource's respective device. + +Each resource type also defines specific backend information/functionality. +For example, each CUDA resource contains a ``cudaStream_t`` value with an +associated get method. See the individual functionality for each resource +in ``raja/tpl/camp/include/resource/``. + +.. note:: Stream IDs are assigned to resources in a round robin fashion. The + number of independent streams for a given backend is limited to the + maximum number of concurrent streams that the back-end supports. + +------------ +Type-Erasure +------------ + +Resources can be declared in two formats: An erased resource, and a concrete +resource. The underlying runtime functionality is the same for both formats. +An erased resource allows a user the ability to change the resource backend +at runtime. 
+ +Concrete CUDA resource:: + + RAJA::resources::Cuda my_cuda_res; + +Erased resource:: + + if (use_gpu) + RAJA::resources::Resource my_res{RAJA::resources::Cuda()}; + else + RAJA::resources::Resource my_res{RAJA::resources::Host()}; + + +Memory allocation on resources:: + + int* a1 = my_cuda_res.allocate(ARRAY_SIZE); + int* a2 = my_res.allocate(ARRAY_SIZE); + +If ``use_gpu`` is ``true``, then the underlying type of ``my_res`` is a CUDA +resource. Therefore ``a1`` and ``a2`` will both be allocated on the GPU. If +``use_gpu`` is ``false``, then only ``a1`` is allocated on the GPU, and +``a2`` is allocated on the host. + + +------ +Forall +------ + +A resource is an optional argument to a ``RAJA::forall`` call. When used, +it is passed as the first argument to the method:: + + RAJA::forall(my_gpu_res, .... ) + +When specifying a CUDA or HIP resource, the ``RAJA::forall`` is executed +aynchronously on a stream. Currently, CUDA and HIP are the only Resources +that enable asynchronous threading with a ``RAJA::forall``. All other calls +default to using the ``Host`` resource until further support is added. + +The Resource type that is passed to a ``RAJA::forall`` call must be a concrete +type. This is to allow for a compile-time assertion that the resource is not +compatible with the given execution policy. For example:: + + using ExecPol = RAJA::cuda_exec_async; + RAJA::resources::Cuda my_cuda_res; + RAJA::resources::Resource my_res{RAJA::resources::Cuda()}; + RAJA::resources::Host my_host_res; + + RAJA::forall(my_cuda_res, .... ) // Compiles. + RAJA::forall(my_res, .... ) // Compilation Error. Not Concrete. + RAJA::forall(my_host_res, .... ) // Compilation Error. Mismatched Resource and Exec Policy. + +Below is a list of the currently available concrete resource types and their +execution policy suport. + + ======== ============================== + Resource Policies supported + ======== ============================== + Cuda | cuda_exec + | cuda_exec_async + Hip | hip_exec + | hip_exec_async + Omp* | omp_target_parallel_for_exec + | omp_target_parallel_for_exec_n + Host | loop_exec + | seq_exec + | openmp_parallel_exec + | omp_for_schedule_exec + | omp_for_nowait_schedule_exec + | simd_exec + | tbb_for_dynamic + | tbb_for_static + ======== ============================== + +.. note:: The ``RAJA::resources::Omp`` resource is still under development. + +IndexSet policies require two execution policies (see :ref:`indexsets-label`). +Currently, users may only pass a single resource to a forall method taking +an IndexSet argument. This resource is used for the inner execution of +each Segment in the IndexSet:: + + using ExecPol = RAJA::ExecPolicy>; + RAJA::forall(my_cuda_res, iset, .... ); + + +When a resource is not provided by the user, a *default* resource is assigned, +which can be accessed in a number of ways. It can be accessed directly from +the concrete resource type:: + + RAJA::resources::Cuda my_default_cuda = RAJA::resources::Cuda::get_default(); + +The resource type can also be deduced from an execution policy:: + + using Res = RAJA::resources::get_resource::type; + Res r = Res::get_default(); + +Finally, the resource type can be deduced from an execution policy:: + + auto my_resource = RAJA::resources::get_default_resource(); + +.. note:: For CUDA and HIP, the default resource is *NOT* the CUDA or HIP + default stream. It is its own stream defined in + ``camp/include/resource/``. 
This is an attempt to break away + from some of the issues that arise from the synchronization behaviour + of the CUDA and HIP default streams. It is still possible to use the + CUDA and HIP default streams as the default resource. This can be + enabled by defining the environment variable + ``CAMP_USE_PLATFORM_DEFAULT_STREAM`` before compiling RAJA in a + project. + +------ +Events +------ + +Event objects allow users to wait or query the status of a resource's action. An +event can be returned from a resource:: + + RAJA::resources::Event e = my_res.get_event(); + +Getting an event like this enqueues an event object for the given back-end. + +Users can call the *blocking* ``wait`` function on the event:: + + e.wait(); + +Preferably, users can enqueue the event on a specific resource, forcing only +that resource to wait for the event:: + + my_res.wait_for(&e); + +The usage allows one to set up dependencies between resource objects and +``RAJA::forall`` calls. + +.. note:: An Event object is only created if a user explicitly sets the event + returned by the ``RAJA::forall`` call to a variable. This avoids + unnecessary event objects being created when not needed. For example:: + + forall>(my_cuda_res, ... + + will *not* generate a cudaStreamEvent, whereas:: + + RAJA::resources::Event e = forall>(my_cuda_res, ... + + will generate a cudaStreamEvent. + +------- +Example +------- + +This example executes three kernels across two cuda streams on the GPU with +a requirement that the first and second kernel finish execution before +launching the third. It also demonstrates copying memory from the device +to host on a resource: + +First, define two concrete CUDA resources and one host resource: + +.. literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_defres_start + :end-before: _raja_res_defres_end + :language: C++ + +Next, allocate data for two device arrays and one host array: + +.. literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_alloc_start + :end-before: _raja_res_alloc_end + :language: C++ + +Then, Execute a kernel on CUDA stream 1 ``res_gpu1``: + +.. literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_k1_start + :end-before: _raja_res_k1_end + :language: C++ + +and execute another kernel on CUDA stream 2 ``res_gpu2`` storing a handle to +an ``Event`` object to a local variable: + +.. literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_k2_start + :end-before: _raja_res_k2_end + :language: C++ + +The next kernel on ``res_gpu1`` requires that the last kernel on ``res_gpu2`` +finish first. Therefore, we enqueue a wait on ``res_gpu1`` that enforces +this: + +.. literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_wait_start + :end-before: _raja_res_wait_end + :language: C++ + +Execute the second kernel on ``res_gpu1`` now that the two previous kernels +have finished: + +.. literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_k3_start + :end-before: _raja_res_k3_end + :language: C++ + +We can enqueue a memcpy operation on ``res_gpu1`` to move data from the device +to the host: + +.. literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_memcpy_start + :end-before: _raja_res_memcpy_end + :language: C++ + +Lastly, we use the copied data on the host side: + +.. 
literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_k4_start + :end-before: _raja_res_k4_end + :language: C++ diff --git a/docs/sphinx/user_guide/feature/scan.rst b/docs/sphinx/user_guide/feature/scan.rst index f0203663eb..3730e08d60 100644 --- a/docs/sphinx/user_guide/feature/scan.rst +++ b/docs/sphinx/user_guide/feature/scan.rst @@ -28,13 +28,12 @@ A few important notes: Also: -.. note:: For scans using the CUDA back-end, RAJA uses the implementations - provided by the NVIDIA cub library, which is available in the - RAJA source repository as a Git submodule. The CMake variable - ``CUB_DIR`` will be automatically set to the location of the cub - library when CUDA is enabled; to use a different version of the - cub library, install it and set the ``CUB_DIR`` variable to the - desired location when running CMake. +.. note:: For scans using the CUDA back-end, RAJA uses the NVIDIA cub library + internally, which is available in the RAJA source repository as a + Git submodule. The CMake variable ``CUB_DIR`` will be automatically + set to the location of the cub library when CUDA is enabled. Details + for using a different version of the cub library are available in + the :ref:`getting_started-label` section. Please see the :ref:`scan-label` tutorial section for usage examples of RAJA scan operations. @@ -111,6 +110,8 @@ Using RAJA exclusive scans is essentially the same as for inclusive scans: * ``RAJA::exclusive_scan< exec_policy >(in, in + N, out)`` * ``RAJA::exclusive_scan< exec_policy >(in, in + N, out, operator)`` +and + * ``RAJA::exclusive_scan_inplace< exec_policy >(in, in + N)`` * ``RAJA::exclusive_scan_inplace< exec_policy >(in, in + N, )`` diff --git a/docs/sphinx/user_guide/feature/sort.rst b/docs/sphinx/user_guide/feature/sort.rst new file mode 100644 index 0000000000..c172559b49 --- /dev/null +++ b/docs/sphinx/user_guide/feature/sort.rst @@ -0,0 +1,161 @@ +.. ## +.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +.. ## and other RAJA project contributors. See the RAJA/COPYRIGHT file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _sort-label: + +================ +Sorts +================ + +RAJA provides portable parallel sort operations, which are basic +parallel algorithm building blocks. They are described in this section. + +A few important notes: + +.. note:: * All RAJA sort operations are in the namespace ``RAJA``. + * Each RAJA sort operation is a template on an *execution policy* + parameter. The same policy types used for ``RAJA::forall`` methods + may be used for RAJA sorts. + * RAJA sort operations accept an optional *comparator* argument so + users can perform different types of sort operations. If + no operator is given, the default is a 'less' operation and + the result is **non-decreasing**. + +Also: + +.. note:: * For sorts using the CUDA back-end, RAJA uses the implementations + provided by the NVIDIA cub library. For information please see + :ref:`build-external-tpl `. + * The RAJA CUDA back-end implementation only supports sorting + arithmetic types using RAJA operators less and greater. + +Please see the :ref:`sort-label` tutorial section for usage examples of RAJA +sort operations. + +----------------- +Sort Operations +----------------- + +In general, a sort operation takes a sequence of numbers ``x`` and a binary +comparison operator ``op`` that forms a strict weak ordering of elements in input +sequence ``x`` and produces a sequence of numbers ``y`` as output. 
The output sequence +is a permutation of the input sequence where each pair of elements ``a`` and ``b``, +where ``a`` is before ``b`` in the output sequence, satisfies ``!(b op a)``. +Sorts are stable if they always preserve the order of equivalent elements, +where equivalent elements satisfy ``!(a op b) && !(b op a)``. + +A **stable sort** takes an input sequence ``x`` where equivalent elements a\ :sub:`i` +and a\ :sub:`j` for any i != j where a\ :sub:`i` appears before a\ :sub:`j` if i < j + + x = { a\ :sub:`0`\, b\ :sub:`0`\, a\ :sub:`1`\, ... } + +and calculates the stably sorted output sequence ``y`` which preserves the order of +equivalent elements, in other words the sorted sequence where element a\ :sub:`i` +appears before the equivalent element a\ :sub:`j` if i < j: + + y = { a\ :sub:`0`\, a\ :sub:`1`\, b\ :sub:`0`\, ... } + +An **unstable sort** may not preserve the order of equivalent elements and +may produce either of the following output sequences. + + y = { a\ :sub:`0`\, a\ :sub:`1`\, b\ :sub:`0`\, ... } + + or + + y = { a\ :sub:`1`\, a\ :sub:`0`\, b\ :sub:`0`\, ... } + +--------------------- +RAJA Unstable Sorts +--------------------- + +RAJA unstable sort operations look like the following: + + * ``RAJA::sort< exec_policy >(container)`` + * ``RAJA::sort< exec_policy >(container, comparator)`` + * ``RAJA::sort< exec_policy >(iter, iter + N)`` + * ``RAJA::sort< exec_policy >(iter, iter + N, comparator)`` + +For example sorting the ``in`` array filled with this sequence of values:: + + 6 7 2 1 0 9 4 8 5 3 4 9 6 3 7 0 1 8 2 5 + +by performing a sequential unstable sort operation using the following code: + +.. literalinclude:: ../../../../examples/tut_sort.cpp + :start-after: _sort_seq_start + :end-before: _sort_seq_end + :language: C++ + +fills the ``out`` array with this sequence of values:: + + 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 + +Note that the syntax is essentially the same as :ref:`scan-label`. +Here, ``container`` is a range of elements and ``iter`` is a random access +iterator to a range of elements. ``container`` and ``iter`` provide access to the +input sequence and contain the output sequence at the end of sort. The first +and third sort operations above will be *non-decreasing* sorts since there is +no comparator argument given; i.e., the sequences will be reordered *in-place* +using operator::less. The second and fourth sorts will apply the comparator +that is passed into the function. + +RAJA also provides sort pairs that operate on key, value pairs stored +separately: + + * ``RAJA::sort_pairs< exec_policy >(keys_container, vals_container)`` + * ``RAJA::sort_pairs< exec_policy >(keys_container, vals_container, comparator)`` + * ``RAJA::sort_pairs< exec_policy >(keys_iter, keys_iter + N, vals_iter)`` + * ``RAJA::sort_pairs< exec_policy >(keys_iter, keys_iter + N, vals_iter, comparator)`` + +Sort pairs generates the same output sequence of keys in ``keys_container`` or +``keys_iter`` as sort does in ``container`` or ``iter`` and also reorders the sequence +of values in ``vals_container`` or ``vals_iter`` by permuting the sequence of values +in the same manner as the sequence of keys; i.e. sorting the sequence of pairs +based on their keys. Note that the ``comparator`` used in sort_pairs only compares +keys. 
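+
+For illustration, a sequential ``sort_pairs`` call using the 'greater'
+comparison operator described below might look like the following sketch
+(the array contents are illustrative)::
+
+  int keys[5] = {6, 7, 2, 1, 0};
+  int vals[5] = {0, 1, 2, 3, 4};  // vals[i] records the original position of keys[i]
+
+  // Sort keys in non-increasing order; vals is permuted in the same way.
+  RAJA::sort_pairs< RAJA::seq_exec >(keys, keys + 5, vals,
+                                     RAJA::operators::greater<int>{});
+
+  // keys is now {7, 6, 2, 1, 0} and vals is now {1, 0, 2, 3, 4}.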
+ +--------------------- +RAJA Stable Sorts +--------------------- + +Using RAJA stable sorts is essentially the same as unstable sorts: + + * ``RAJA::stable_sort< exec_policy >(container)`` + * ``RAJA::stable_sort< exec_policy >(container, comparator)`` + * ``RAJA::stable_sort< exec_policy >(iter, iter + N)`` + * ``RAJA::stable_sort< exec_policy >(iter, iter + N, comparator)`` + +RAJA also provides stable sort pairs that operate on key, value pairs stored +separately: + + * ``RAJA::stable_sort_pairs< exec_policy >(keys_container, vals_container)`` + * ``RAJA::stable_sort_pairs< exec_policy >(keys_container, vals_container, comparator)`` + * ``RAJA::stable_sort_pairs< exec_policy >(keys_iter, keys_iter + N, vals_iter)`` + * ``RAJA::stable_sort_pairs< exec_policy >(keys_iter, keys_iter + N, vals_iter, comparator)`` + +.. _sortops-label: + +-------------------- +RAJA Comparison Operators +-------------------- + +RAJA provides two operators that can be used to produce different ordered sorts: + + * ``RAJA::operators::less`` + * ``RAJA::operators::greater`` + +.. note:: * All RAJA comparison operators are in the namespace ``RAJA::operators``. + +------------------- +Sort Policies +------------------- + +For information about RAJA execution policies to use with sort operations, +please see :ref:`policies-label`. + + diff --git a/docs/sphinx/user_guide/feature/tiling.rst b/docs/sphinx/user_guide/feature/tiling.rst index b2bb1316df..907905b743 100644 --- a/docs/sphinx/user_guide/feature/tiling.rst +++ b/docs/sphinx/user_guide/feature/tiling.rst @@ -16,8 +16,8 @@ In this section, we discuss RAJA statements that can be used to tile nested for-loops. Typical loop tiling involves partitioning an iteration space into a collection of "tiles" and then iterating over tiles in outer loops and entries within each tile in inner loops. Many scientific computing algorithms -can benefit from loop tiling due to more efficient cache usage and other -considerations. +can benefit from loop tiling due to more efficient cache usage on a CPU or +use of GPU shared memory. For example, an operation performed using a for-loop with a range of [0, 10):: @@ -44,7 +44,7 @@ statement types. using KERNEL_EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::Tile<0, RAJA::statement::tile_fixed<2>, RAJA::seq_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed<2>, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > @@ -59,11 +59,11 @@ statement types. In RAJA, the simplest way to tile an iteration space is to use RAJA ``statement::Tile`` and ``statement::For`` statement types. A ``statement::Tile`` type is similar to a ``statement::For`` type, but takes -a tile size as the second template argument. The Tile statement generates -the outer loop over tiles and the For statement iterates over each tile. -Nested together, as in the example, these statements will pass the global -index 'i' to the loop body in the lambda expression as in the non-tiled -version above. +a tile size as the second template argument. The ``statement::Tile`` +construct generates the outer loop over tiles and the ``statement::For`` +statement iterates over each tile. Nested together, as in the example, these +statements will pass the global index 'i' to the loop body in the lambda +expression as in the non-tiled version above. .. note:: When using ``statement::Tile`` and ``statement::For`` types together to define a tiled loop structure, the integer passed as the first @@ -71,13 +71,13 @@ version above. 
indicates that they both apply to the same item in the iteration space tuple passed to the ``RAJA::kernel`` methods. -RAJA also provides alternative Tile and For statements that provide the tile +RAJA also provides alternative tiling and for statements that provide the tile number and local tile index, if needed inside the kernel body, as shown below:: using KERNEL_EXEC_POL2 = RAJA::KernelPolicy< RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, - RAJA::statement::tile_fixed<2>, RAJA::seq_exec, + RAJA::tile_fixed<2>, RAJA::seq_exec, RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, RAJA::statement::Lambda<0> @@ -98,16 +98,17 @@ number and local tile index, if needed inside the kernel body, as shown below:: }); The ``statement::TileTCount`` type allows the tile number to be accessed as a -parameter and the ``statement::ForICount`` type allows the local tile loop -index to be accessed. These values are specified in the tuple, which is the -second argument passed to the ``RAJA::kernel_param`` method above. The -``statement::Param<#>`` type appearing as the second template parameter for -each statement type indicates which parameter tuple entry the tile number -or local tile loop index is passed to the lambda, and in what order. Here, -the tile number is the second lambda argument (tuple parameter '0') and the -local tile loop index is the third lambda argument (tuple parameter '1'). +lambda argument and the ``statement::ForICount`` type allows the local tile +loop index to be accessed as a lambda argument. These values are specified in +the tuple, which is the second argument passed to the ``RAJA::kernel_param`` +method above. The ``statement::Param<#>`` type appearing as the second +template parameter for each statement type indicates which parameter tuple +entry the tile number or local tile loop index is passed to the lambda, and +in which order. Here, the tile number is the second lambda argument (tuple +parameter '0') and the local tile loop index is the third lambda argument +(tuple parameter '1'). .. note:: The global loop indices always appear as the first lambda expression - arguments. Then, the parameter tuples, identified by the integers + arguments. Then, the parameter tuples identified by the integers in the ``Param`` statement types given for the loop statement types follow. diff --git a/docs/sphinx/user_guide/feature/view.rst b/docs/sphinx/user_guide/feature/view.rst index dfc4472eff..d46930c1c6 100644 --- a/docs/sphinx/user_guide/feature/view.rst +++ b/docs/sphinx/user_guide/feature/view.rst @@ -12,8 +12,8 @@ View and Layout =============== -Matrix and tensor objects are naturally expressed in -scientific computing applications as multi-dimensional arrays. However, +Matrix and tensor objects, which are common in scientific computing +applications, are naturally expressed as multi-dimensional arrays. However, for efficiency in C and C++, they are usually allocated as one-dimensional arrays. For example, a matrix :math:`A` of dimension :math:`N_r \times N_c` is typically allocated as:: @@ -22,13 +22,13 @@ typically allocated as:: Using a one-dimensional array makes it necessary to convert two-dimensional indices (rows and columns of a matrix) to a one-dimensional -pointer offset index to access the corresponding array memory location. One -could introduce a macro such as:: +pointer offset to access the corresponding array memory location. 
One +could use a macro such as:: #define A(r, c) A[c + N_c * r] to access a matrix entry in row `r` and column `c`. However, this solution has -limitations; e.g., additional macro definitions are needed when adopting a +limitations; e.g., additional macro definitions may be needed when adopting a different matrix data layout or when using other matrices. To facilitate multi-dimensional indexing and different indexing layouts, RAJA provides ``RAJA::View`` and ``RAJA::Layout`` classes. @@ -37,8 +37,8 @@ multi-dimensional indexing and different indexing layouts, RAJA provides RAJA Views ---------- -A ``RAJA::View`` object wraps a pointer and enables various indexing schemes -based on the definition of a ``RAJA::Layout`` object. We can +A ``RAJA::View`` object wraps a pointer and enables indexing into the data +referenced via the pointer based on a ``RAJA::Layout`` object. We can create a ``RAJA::View`` for a matrix with dimensions :math:`N_r \times N_c` using a RAJA View and a default RAJA two-dimensional Layout as follows:: @@ -52,7 +52,7 @@ extent of each matrix dimension as arguments. The template parameters to the ``RAJA::View`` type define the pointer type and the Layout type; here, the Layout just defines the number of index dimensions. Using the resulting view object, one may access matrix entries in a row-major fashion (the -default RAJA layout) through the View parenthesis operator:: +default RAJA layout) through the view *parenthesis operator*:: // r - row index of a matrix // c - column index of a matrix @@ -82,6 +82,54 @@ accesses array entries with unit stride. The loop:: access array entries with stride N :subscript:`n` * N :subscript:`(n-1)` * ... * N :subscript:`(j+1)`. +MultiView +^^^^^^^^^^^^^^^^ + +A ``RAJA::MultiView`` object wraps an array-of-pointers, +or a pointer-to-pointers, whereas a ``RAJA::View`` wraps a single +pointer or array. This allows a single ``RAJA::Layout`` to be applied to +multiple arrays internal to the MultiView, allowing multiple arrays to share indexing +arithmetic when their access patterns are the same. + +The instantiation of a MultiView works exactly like a standard View, +except that it takes an array-of-pointers. In the following example, a MultiView +applies a 1-D layout of length 4 to 2 internal arrays in ``myarr``. + +.. literalinclude:: ../../../../examples/multiview.cpp + :start-after: _multiview_example_1Dinit_start + :end-before: _multiview_example_1Dinit_end + :language: C++ + +The default MultiView accesses internal arrays via the 0th position of the MultiView. + +.. literalinclude:: ../../../../examples/multiview.cpp + :start-after: _multiview_example_1Daccess_start + :end-before: _multiview_example_1Daccess_end + :language: C++ + +The index into the array-of-pointers can be moved to different +indices of the MultiView ``()`` access operator, rather than the default 0th position. By +passing a third template parameter to the MultiView constructor, the internal array index +and the integer indicating which array to access can be reversed. + +.. literalinclude:: ../../../../examples/multiview.cpp + :start-after: _multiview_example_1Daopindex_start + :end-before: _multiview_example_1Daopindex_end + :language: C++ + +As the number of Layout dimensions increases, the index into the array-of-pointers can be +moved to more distinct locations in the MultiView ``()`` access operator. 
Here is an example +which compares the accesses of a 2-D layout on a normal ``RAJA::View`` with a ``RAJA::MultiView`` +with the array-of-pointers index set to the 2nd position. + +.. literalinclude:: ../../../../examples/multiview.cpp + :start-after: _multiview_example_2Daopindex_start + :end-before: _multiview_example_2Daopindex_end + :language: C++ + +.. note:: MultiView does not currently work with Layouts which use strongly + typed indices. It has not been tested yet with atomic accesses. + ------------ RAJA Layouts ------------ @@ -90,7 +138,7 @@ RAJA Layouts striding orders, offsets, and permutations. In addition to layouts created using the default Layout constructor, as shown above, RAJA provides other methods to generate layouts for different indexing patterns. We describe -these next. +them here. Permuted Layout ^^^^^^^^^^^^^^^^ @@ -114,11 +162,12 @@ second index (index 1 - extent 7) has stride 55 (= 5*11). The first argument to ``RAJA::make_permuted_layout`` is a C++ array whose entries define the extent of each index dimension. **The double braces are -required to prevent compilation errors/warnings about issues trying to -initialize a sub-object.** The second argument is the striding permutation. +required to properly initialize the internal sub-object which holds the +extents.** The second argument is the striding permutation and similarly +requires double braces. -In the next example, we create the same permuted layout, then create -a ``RAJA::View`` with it in a way that tells the View which index has +In the next example, we create the same permuted layout as above, then create +a ``RAJA::View`` with it in a way that tells the view which index has unit stride:: const int s0 = 5; // extent of dimension 0 @@ -131,18 +180,19 @@ unit stride:: RAJA::Layout<3> layout = RAJA::make_permuted_layout( {{s0, s1, s2}}, perm ); - // The Layout template parameters are dimension, 'linear index' type, - // and the index with unit stride - RAJA::View > Bview(B, layout); + // The Layout template parameters are dimension, 'linear index' type used + // when converting an index triple into the corresponding pointer offset + // index, and the index with unit stride + RAJA::View > Bview(B, layout); // Equivalent to indexing as: B[i + j * s0 * s2 + k * s0] Bview(i, j, k) = ...; .. note:: Telling a view which index has unit stride makes the multi-dimensional index calculation more efficient by avoiding - multiplication by '1' when it is unnecessary. **This must be done - so that the layout permutation and unit-stride index specification - are the same to prevent incorrect indexing.** + multiplication by '1' when it is unnecessary. **The layout + permutation and unit-stride index specification + must be consistent to prevent incorrect indexing.** Offset Layout ^^^^^^^^^^^^^^^^ @@ -164,9 +214,15 @@ it using indices in :math:`[-5, 5]`. In other words, one can use the loop:: } to initialize the values of the array. Each 'i' loop index value is converted -to array offset access index by subtracting the lower offset to it; i.e., in +to an array offset index by subtracting the lower offset from it; i.e., in the loop, each 'i' value has '-5' subtracted from it to properly access the -array entry. +array entry. That is, the sequence of indices generated by the for-loop:: + + -5 -4 -3 ... 5 + +will index into the data array as:: + + 0 1 2 ... 10 The arguments to the ``RAJA::make_offset_layout`` method are C++ arrays that hold the start and end values of the indices. 
RAJA offset layouts support @@ -177,9 +233,8 @@ any number of dimensions; for example:: defines a two-dimensional layout that enables one to index into a view using indices :math:`[-1, 2]` in the first dimension and indices :math:`[-5, 5]` in -the second dimension. As we remarked earlier, double braces are needed to -prevent compilation errors/warnings about issues trying to initialize a -sub-object. +the second dimension. As noted earlier, double braces are needed to +properly initialize the internal data in the layout object. Permuted Offset Layout ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -195,13 +250,9 @@ indices. For example,:: Here, the two-dimensional index space is :math:`[-1, 2] \times [-5, 5]`, the same as above. However, the index strides are permuted so that the first index (index 0) has unit stride and the second index (index 1) has stride 4, -since the first index dimension has length 4. +which is the extent of the first index (:math:`[-1, 2]`). -Complete examples illustrating ``RAJA::Layouts`` and ``RAJA::Views`` may -be found in the :ref:`offset-label` and :ref:`permuted-layout-label` -tutorial sections. - -.. note:: It is important to note some facts about RAJA Layout types. +.. note:: It is important to note some facts about RAJA layout types. All layouts have a permutation. So a permuted layout and a "non-permuted" layout (i.e., default permutation) has the type ``RAJA::Layout``. Any layout with an offset has the @@ -210,14 +261,19 @@ tutorial sections. choice to avoid the overhead of offset computations in the ``RAJA::View`` data access operator when they are not needed. +Complete examples illustrating ``RAJA::Layouts`` and ``RAJA::Views`` may +be found in the :ref:`offset-label` and :ref:`permuted-layout-label` +tutorial sections. + Typed Layouts ^^^^^^^^^^^^^ RAJA provides typed variants of ``RAJA::Layout`` and ``RAJA::OffsetLayout`` -enabling user specified index types. Basic usage requires specifying types for -the linear index, and the multi-dimensional indicies. The following example creates -typed layouts wherein the linear index is of type TIL and the multidimensional -indices are TIX, TIY,:: +that enable users to specify integral index types. Usage requires +specifying types for the linear index and the multi-dimensional indices. +The following example creates two two-dimensional typed layouts where the +linear index is of type TIL and the '(x, y)' indices for accessing the data +have types TIX and TIY:: RAJA_INDEX_VALUE(TIX, "TIX"); RAJA_INDEX_VALUE(TIY, "TIY"); @@ -226,13 +282,18 @@ indices are TIX, TIY,:: RAJA::TypedLayout> layout(10, 10); RAJA::TypedOffsetLayout> offLayout(10, 10);; +.. note:: Using the ``RAJA_INDEX_VALUE`` macro to create typed indices + is helpful to prevent incorrect usage by detecting at compile + time when, for example, indices are passed to a view parenthesis + operator in the wrong order. + Shifting Views ^^^^^^^^^^^^^^ -RAJA Views include a shift method enabling users to generate a new View with -offsets to the base View layout. The base View may be templated with either a -standard Layout, OffsetLayout and the typed variants. The generated View will -use an OffsetLayout or TypedOffsetLayout depending on whether the base +RAJA views include a shift method enabling users to generate a new view with +offsets to the base view layout. The base view may be templated with either a +standard layout or offset layout and their typed variants.
The new view will +use an offset layout or typed offset layout depending on whether the base view employed a typed layout. The example below illustrates shifting view indices by :math:`N`, :: @@ -264,17 +325,18 @@ three-dimensional index space to a one-dimensional linear space:: RAJA::Layout<3> layout(5, 7, 11); // Map from 3-D index (2, 3, 1) to the linear index - // Note that there is no striding permutation, so rightmost is stride-1 + // Note that there is no striding permutation, so the rightmost index is + // stride-1 int lin = layout(2, 3, 1); // lin = 188 (= 1 + 3 * 11 + 2 * 11 * 7) // Map from linear index to 3-D index int i, j, k; layout.toIndices(lin, i, j, k); // i,j,k = {2, 3, 1} -``RAJA::Layout`` also supports *projections*, where one or more dimension +RAJA layouts also support *projections*, where one or more dimension extent is zero. In this case, the linear index space is invariant for -those multi-dimensional index entries; thus, the 'toIndicies(...)' method -will always return zero for each dimension with zero extent. For example:: +those index entries; thus, the 'toIndices(...)' method will always return +zero for each dimension with zero extent. For example:: // Create a layout with second dimension extent zero RAJA::Layout<3> layout(3, 0, 5); // The second index, 10, is invariant int lin1 = layout(0, 10, 0); // lin1 = 0 int lin2 = layout(0, 5, 1); // lin2 = 1 - // The inverse mapping always produces a 0 for j + // The inverse mapping always produces zero for j int i,j,k; layout.toIndices(lin2, i, j, k); // i,j,k = {0, 0, 1} @@ -311,7 +373,7 @@ way to do this in parallel using OpenMP and a RAJA atomic view:: // Create a 1-dimensional view for histogram array RAJA::View > hist_view(hist_dat, M); - // Create an atomic view for histogram array + // Create an atomic view into the histogram array using the view above auto hist_atomic_view = RAJA::make_atomic_view(hist_view); RAJA::forall< EXEC_POL >(RAJA::RangeSegment(0, N), [=] (int i) { @@ -321,16 +383,16 @@ way to do this in parallel using OpenMP and a RAJA atomic view:: Here, we create a one-dimensional view for the histogram data array. Then, we create an atomic view from that, which we use in the RAJA loop to compute the histogram entries. Since the view is atomic, only one OpenMP -thread can write to each entry at a time. +thread can write to each array entry at a time. ------------------------------------ RAJA View/Layouts Bounds Checking ------------------------------------ The RAJA CMake variable ``RAJA_ENABLE_BOUNDS_CHECK`` may be used to turn on/off -runtime bounds checking for RAJA Views. This may be a useful debugging aid for -users. When bounds checkoing is turned off (default case), there is no -additional run time overhead incurred. Bounds checking is accomplished within -RAJA layouts (both offset and standard layouts). Upon an out of bounds error, -RAJA will abort the program and print the index that is out of bounds as -well the value of the index and bounds. +runtime bounds checking for RAJA views. This may be a useful debugging aid for +users. When attempting to use an index value that is out of bounds, +RAJA will abort the program and print the index that is out of bounds along +with the valid bounds for that index. Since the bounds checking is a runtime +operation, it incurs non-negligible overhead. When bounds checking is turned +off (default case), there is no additional run time overhead incurred.
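+
+As a minimal sketch of what a bounds violation looks like (the array and view
+names here are hypothetical and not taken from the RAJA example codes),
+consider a two-dimensional view of a small array in a build configured with
+``-DRAJA_ENABLE_BOUNDS_CHECK=On``::
+
+  double data[3 * 4];
+
+  // 3 x 4 view of the data array using a default two-dimensional layout
+  RAJA::View< double, RAJA::Layout<2> > dview(data, 3, 4);
+
+  dview(2, 3) = 1.0;   // in bounds; equivalent to data[2 * 4 + 3]
+  dview(3, 0) = 1.0;   // first index is out of bounds, so a bounds-checked
+                       // build prints the offending index and its valid
+                       // bounds and aborts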
diff --git a/docs/sphinx/user_guide/feature/workgroup.rst b/docs/sphinx/user_guide/feature/workgroup.rst new file mode 100644 index 0000000000..4a89e5b3a2 --- /dev/null +++ b/docs/sphinx/user_guide/feature/workgroup.rst @@ -0,0 +1,303 @@ +.. ## +.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +.. ## and other RAJA project contributors. See the RAJA/COPYRIGHT file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _workgroup-label: + +========= +WorkGroup +========= + +In this section, we describe the basics of RAJA workgroups. +``RAJA::WorkPool``, ``RAJA::WorkGroup``, and ``RAJA::WorkSite`` class templates comprise the +RAJA interface for grouped loop execution. ``RAJA::WorkPool`` takes a set of simple +loops (e.g., non-nested loops) and instantiates a ``RAJA::WorkGroup``. ``RAJA::WorkGroup`` +represents an executable form of those loops and when run makes a ``RAJA::WorkSite``. +``RAJA::WorkSite`` holds all of the resources used for a single run of the loops. Be aware +that the RAJA workgroup constructs API is still being developed and may change in later RAJA +releases. + +.. note:: * All **workgroup** constructs are in the namespace ``RAJA``. + * The ``RAJA::WorkPool``, ``RAJA::WorkGroup``, and ``RAJA::WorkSite`` class templates + are templated on: + * a WorkGroup policy which is composed of: + * a work execution policy. + * a work ordering policy. + * a work storage policy. + * an index type that is the first argument to the loop bodies. + * a list of extra argument types that are the rest of the arguments to + the loop bodies. + * an allocator type to be used for the memory used to store and + manage the loop bodies. + * The ``RAJA::WorkPool::enqueue`` method takes two arguments: + * an iteration space object, and + * a lambda expression representing the loop body. + +Examples showing how to use RAJA workgroup methods may be found in +the :ref:`tutorial-label`. + +For more information on RAJA work policies and iteration space constructs, +see :ref:`policies-label` and :ref:`index-label`, respectively. + +.. _workgroup-Policies-label: + +-------- +Policies +-------- + +The behavior of the RAJA workgroup constructs is determined by a policy. +The ``RAJA::WorkGroupPolicy`` has three components, a work execution policy, +a work ordering policy, and a work storage policy. ``RAJA::WorkPool``, +``RAJA::WorkGroup``, and ``RAJA::WorkSite`` class templates all +take the same policy and template arguments. For example:: + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::seq_work, + RAJA::ordered, + RAJA::ragged_array_of_objects >; + +is a workgroup policy that will run loops sequentially on the host in the order +they were enqueued and store the loop bodies sequentially in single buffer in +memory. + +The work execution policy acts like the execution policies used with ``RAJA::forall`` +and determines the backend used to run the loops and the parallelism within each +loop. + + ====================================== ======================================== + Work Execution Policies Brief description + ====================================== ======================================== + seq_work Execute loop iterations strictly + sequentially. + simd_work Execute loop iterations sequentially and + try to force generation of SIMD + instructions via compiler hints in RAJA + internal implementation. + loop_work Execute loop iterations sequentially and + allow compiler to generate any + optimizations. 
+ omp_work Execute loop iterations in parallel + using OpenMP. + tbb_work Execute loop iterations in parallel + using TBB. + cuda_work, Execute loop iterations in parallel + cuda_work_async using a CUDA kernel launched with given + thread-block size. + omp_target_work Execute loop iterations in parallel + using OpenMP target. + ====================================== ======================================== + +The work ordering policy acts like the segment iteration execution policies when +``RAJA::forall`` is used with a ``RAJA::IndexSet`` and determines the backend +used when iterating over the loops and the parallelism between each loop. + + ====================================== ======================================== + Work Ordering Policies Brief description + ====================================== ======================================== + ordered Execute loops sequentially in the order + they were enqueued using forall. + reverse_ordered Execute loops sequentially in the + reverse of the order they were + enqueued using forall. + unordered_cuda_loop_y_block_iter_x_threadblock_average + Execute loops in parallel by mapping + each loop to a set of cuda blocks with + the same index in the y direction in + a cuda kernel. Each loop is given a + number of threads over one or more + blocks in the x direction equal to the + average number of iterations of all the + loops rounded up to a multiple of the + block size. + ====================================== ======================================== + +The work storage policy determines the strategy used to allocate and layout the +storage used to store the ranges, loop bodies, and other data necessary to +implement the workstorage constructs. + + ====================================== ======================================== + Work Storage Policies Brief description + ====================================== ======================================== + array_of_pointers Store loop data in individual + allocations and keep an array of + pointers to the individual loop data + allocations. + ragged_array_of_objects Store loops sequentially in a single + allocation, reallocating and moving the + loop data items as needed, and keep an + array of offsets to the individual loop + data items. + constant_stride_array_of_objects Store loops sequentially in a single + allocation with a consistent stride + between loop data items, reallocating + and/or changing the stride and moving + the loop data items as needed. + ====================================== ======================================== + + +.. _workgroup-Arguments-label: + +--------- +Arguments +--------- + +The next two template arguments to the workgroup constructs determine the +call signature of the loop bodies that may be added to the workgroup. The first +is an index type which is the first parameter in the call signature. Next is a +list of types called ``RAJA::xargs``, short for extra arguments, that gives the +rest of the types of the parameters in the call signature. The values of the +extra arguments are passed in when the loops are run, see :ref:`workgroup-WorkGroup-label`. +For example:: + + int, RAJA::xargs<> + +can be used with lambdas with the following signature:: + + [=](int) { ... } + +and:: + + int, RAJA::xargs<int*, double> + +can be used with lambdas with the following signature:: + + [=](int, int*, double) { ... } + + +.. 
_workgroup-Allocators-label: + +---------- +Allocators +---------- + +The last template argument to the workgroup constructs is an allocator type +that conforms to the allocator named requirement used in the standard library. +This gives you control over how memory is allocated, for example with umpire, +and what memory space is used, both of which have performance implications. +Find the requirements for allocator types along with a simple example here +https://en.cppreference.com/w/cpp/named_req/Allocator. The default allocator +used by the standard template library may be used with ordered and non-GPU +policies:: + + using Allocator = std::allocator<char>; + +.. note:: * The allocator type must use template argument char. + * Allocators must provide memory that is accessible where it is used. + * Ordered work order policies only require memory that is accessible + where loop bodies are enqueued. + * Unordered work order policies require memory that is accessible + from both where the loop bodies are enqueued and from where the + loop is executed based on the work execution policy. + * For example, when using cuda work execution policies with cuda + unordered work order policies, pinned memory is a good choice + because it is always accessible on the host and device. + + +.. _workgroup-WorkPool-label: + +-------- +WorkPool +-------- + +The ``RAJA::WorkPool`` class template holds a set of simple (e.g., non-nested) +loops that are enqueued one at a time. For example, to enqueue a C-style loop +that adds two vectors, like:: + + for (int i = 0; i < N; ++i) { + c[i] = a[i] + b[i]; + } + +is as simple as calling enqueue on a ``RAJA::WorkPool`` object and passing the +same arguments you would pass to ``RAJA::forall``.:: + + using WorkPool_type = RAJA::WorkPool< workgroup_policy, + int, RAJA::xargs<>, + Allocator >; + WorkPool_type workpool(Allocator{}); + + workpool.enqueue(RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + +Note that WorkPool may have to allocate and reallocate multiple times to store +a set of loops depending on the work storage policy. Reallocation can be avoided +by reserving enough memory before adding any loops.:: + + workpool.reserve(num_loops, storage_bytes); + +Here ``num_loops`` is the number of loops to allocate space for and +``storage_bytes`` is the amount of storage to allocate. These may be used +differently depending on the work storage policy. The number of loops +enqueued in a ``RAJA::WorkPool`` and the amount of storage used may be queried +using:: + + size_t num_loops = workpool.num_loops(); + size_t storage_bytes = workpool.storage_bytes(); + +Storage will automatically be reserved when reusing a ``RAJA::WorkPool`` object +based on the maximum seen values for num_loops and storage_bytes. + +When you've added all the loops you want to the set, you can call instantiate +on the ``RAJA::WorkPool`` to generate a ``RAJA::WorkGroup``.:: + + WorkGroup_type workgroup = workpool.instantiate(); + +.. _workgroup-WorkGroup-label: + +--------- +WorkGroup +--------- + +The ``RAJA::WorkGroup`` class template is responsible for hanging onto the set +of loops and running the loops. The ``RAJA::WorkGroup`` owns its loops and must +not be destroyed before any loops run asynchronously using it have completed. +It is instantiated from a ``RAJA::WorkPool`` object which transfers ownership +of a set of loops to the ``RAJA::WorkGroup`` and prepares the loops to be run.
+For example:: + + using WorkGroup_type = RAJA::WorkGroup< workgroup_policy, + int, RAJA::xargs<>, + Allocator >; + WorkGroup_type workgroup = workpool.instantiate(); + +creates a ``RAJA::WorkGroup`` ``workgroup`` from the loops in ``workpool`` and +leaves ``workpool`` empty and ready for reuse. When you want to run the loops +simply call run on ``workgroup`` and pass in the extra arguments:: + + WorkSite_type worksite = workgroup.run(); + +In this case no extra arguments were passed to run because the ``RAJA::WorkGroup`` +specified no extra arguments ``RAJA::xargs<>``. Passing extra arguments when the +loops are run lets you delay creation of those arguments until you plan to run +the loops. This lets the value of the arguments depend on the loops in the set. +A simple example of this may be found in the tutorial here :ref:`tutorial-label`. +Run produces a ``RAJA::WorkSite`` object. + + +.. _workgroup-WorkSite-label: + +-------- +WorkSite +-------- + +The ``RAJA::WorkSite`` class template is responsible for extending the lifespan +of objects used when running loops asynchronously. This means that the +``RAJA::WorkSite`` object must remain alive until the call to run has been +synchronized. For example the scoping here:: + + { + using WorkSite_type = RAJA::WorkSite< workgroup_policy, + int, RAJA::xargs<>, + Allocator >; + WorkSite_type worksite = workgroup.run(); + + // do other things + + synchronize(); + } + +ensures that ``worksite`` survives until after synchronize is called. diff --git a/docs/sphinx/user_guide/features.rst b/docs/sphinx/user_guide/features.rst index 608c90bec2..8b9caf6b27 100644 --- a/docs/sphinx/user_guide/features.rst +++ b/docs/sphinx/user_guide/features.rst @@ -12,7 +12,7 @@ RAJA Features ************************ -The following sections describe key aspects of the main RAJA features. +The following sections describe key aspects of the main RAJA features. .. toctree:: :maxdepth: 2 @@ -22,7 +22,11 @@ The following sections describe key aspects of the main RAJA features. feature/iteration_spaces feature/view feature/reduction + feature/resource feature/atomic feature/scan + feature/sort feature/local_array feature/tiling + feature/plugins + feature/workgroup \ No newline at end of file diff --git a/docs/sphinx/user_guide/getting_started.rst b/docs/sphinx/user_guide/getting_started.rst index 535301fc89..c48705b2a6 100644 --- a/docs/sphinx/user_guide/getting_started.rst +++ b/docs/sphinx/user_guide/getting_started.rst @@ -22,8 +22,8 @@ Requirements The primary requirement for using RAJA is a C++11 compliant compiler. Accessing various programming model back-ends requires that they be supported by the compiler you chose. Available options and how to enable or disable -them are described in :ref:`configopt-label`. To build and use RAJA in its -simplest form requires: +them are described in :ref:`configopt-label`. To build RAJA in its most basic +form and use its simplest features: - C++ compiler with C++11 support - `CMake `_ version 3.9 or greater. @@ -39,13 +39,13 @@ the command:: $ git clone --recursive https://github.com/LLNL/RAJA.git -The ``--recursive`` argument above is needed to pull in other projects -RAJA depends on as Git *submodules*. Currently, RAJA submodule dependencies -are: +The ``--recursive`` argument above is needed to pull in necessary RAJA +dependencies as Git *submodules*. 
Current RAJA dependencies are: - `BLT build system `_ -- `Camp portable utility library `_ -- `NVIDIA CUB `_ +- `Camp compiler agnostic metaprogramming library `_ +- `CUB CUDA utilities library `_ +- `rocPRIM Hip parallel primitives library `_ You probably don't need to know much about these other projects to start using RAJA. But, if you want to know more about them, click on the links above. @@ -54,7 +54,7 @@ After running the clone command, a copy of the RAJA repository will reside in a ``RAJA`` subdirectory where you ran the clone command. You will be on the ``develop`` branch of RAJA, which is our default branch. -If you forget to pass the ``--recursive`` argument to the ``git clone`` +If you do not pass the ``--recursive`` argument to the ``git clone`` command, you can type the following commands after cloning:: $ cd RAJA @@ -72,57 +72,65 @@ Build and Install ================== Building and installing RAJA can be very easy or more complicated, depending -on which features you want to use and how well you understand how to use -your system. +on which features you want to use and how easy it is to use your system. -------------- Building RAJA -------------- -RAJA uses CMake to configure a build. A basic configuration looks like:: +RAJA uses CMake to configure a build. A "bare bones" configuration looks like:: $ mkdir build-dir && cd build-dir $ cmake -DCMAKE_INSTALL_PREFIX=/path/to/install ../ .. note:: * RAJA requires a minimum CMake version of 3.9. * Builds must be *out-of-source*. RAJA does not allow building in - the source directory, so you must create a build directory. + the source directory, so you must create a build directory and + run CMake in it. -When you run CMake, it will provide output about the compiler that has been -found and which features are discovered. Some RAJA features, like OpenMP -support are enabled if they are discovered. For a complete summary of -configuration options, please see :ref:`configopt-label`. +When you run CMake, it will generate output about the build environment +(compiler and version, options, etc.). Some RAJA features, +like OpenMP support are enabled by default if, for example, the compiler +supports OpenMP. These can be disabled if desired. For a summary of +RAJA configuration options, please see :ref:`configopt-label`. After CMake successfully completes, you compile RAJA by executing the ``make`` command in the build directory; i.e.,:: - $ cd build-dir $ make -If you have access to a multi-core system you can compile in parallel by running -``make -j`` (to build with all available cores) or ``make -j N`` to build using -N cores. +If you have access to a multi-core system, you can compile in parallel by +running ``make -j`` (to build with all available cores) or ``make -j N`` to +build using N cores. -.. note:: RAJA is configured to build its unit tests by default. If you do not - disable them with the appropriate CMake option, you can run them - after the build completes to check if everything compiled properly. - The easiest way to do this is to type:: +.. note:: * RAJA is configured to build its unit tests by default. If you do not + disable them with the appropriate CMake option (please see + :ref:`configopt-label`), you can run them after the build completes + to check if everything is built properly. - $ make test + The easiest way to run the full set of RAJA tests is to type:: - after the build completes. + $ make test - You can also run individual tests by invoking individual test - executables directly. 
They live in subdirectories in the ``test`` - directory. RAJA tests use the - `Google Test framework `_, - so you can also run tests via Google Test commands. + in the build directory after the build completes. - It is very important to note that the version of Googletest that - is used in RAJA version v0.11.0 or newer requires CUDA version - 9.2.x or newer when compiling with nvcc. Thus, if you build - RAJA with CUDA enabled and want to also enable RAJA tests, you - must use CUDA version 9.2.x or newer. + You can also run individual tests by invoking test + executables directly. They will be located in the ``test`` + subdirectory in the build space directory. RAJA tests use the + `Google Test framework `_, + so you can also run tests via Google Test commands. + + * RAJA also contains example and tutorial exercise + programs you can run if you wish. Similar to the RAJA tests, + the examples and exercises are built by default and can be + disabled with CMake options (see :ref:`configopt-label`). The + source files for these are located in the ``RAJA/examples`` and + ``RAJA/exercises`` directories, respectively. When built, the + executables for the examples and exercises will be located in + the ``bin`` subdirectory in the build space directory. Feel free to + experiment by editing the source files and recompiling. + +.. _build-external-tpl-label: .. note:: You may use externally-supplied versions of the camp and cub libraries with RAJA if you wish. To do so, pass the following @@ -130,6 +138,102 @@ N cores. * External camp: -DEXTERNAL_CAMP_SOURCE_DIR= * External cub: -DENABLE_EXTERNAL_CUB=On -DCUB_DIR= +----------------- +GPU Builds, etc. +----------------- + +CUDA +^^^^^^ + +To run RAJA code on NVIDIA GPUs, one typically must have a CUDA compiler +installed on your system, in addition to a host code compiler. You may need +to specify both when you run CMake. The host compiler is specified using the +``CMAKE_CXX_COMPILER`` CMake variable. The CUDA compiler is specified with +the ``CMAKE_CUDA_COMPILER`` variable. + +When using the NVIDIA nvcc compiler for RAJA CUDA functionality, the variables: + + * CMAKE_CUDA_FLAGS_RELEASE + * CMAKE_CUDA_FLAGS_DEBUG + * CMAKE_CUDA_FLAGS_RELWITHDEBINFO + +which corresponding to the standard CMake build types are used to pass flags +to nvcc. + +.. note:: When nvcc must pass options to the host compiler, the arguments + can be included using these CMake variables. Host compiler + options must be prepended with the `-Xcompiler` directive. + +To set the CUDA compute architecture for the nvcc compiler, which should be +chosen based on the NVIDIA GPU hardware you are using, you can use the +``CUDA_ARCH`` CMake variable. For example, the CMake option:: + + -DCUDA_ARCH=sm_60 + +will tell the compiler to use the `sm_60` SASS architecture in its second +stage of compilation. It will pick the PTX architecture to use in the first +stage of compilation that is suitable for the SASS architecture you specify. + +Alternatively, you may specify the PTX and SASS architectures, using +appropriate nvcc options in the ``CMAKE_CUDA_FLAGS_*`` variables. + +.. note:: **RAJA requires a minimum CUDA architecture level of `sm_35` to use + all supported CUDA features.** Mostly, the architecture level affects + which RAJA CUDA atomic operations are available and how they are + implemented inside RAJA. This is described in :ref:`atomics-label`. 
+ + * If you do not specify a value for ``CUDA_ARCH``, it will be set to + `sm_35` by default and CMake will emit a status message + indicatting this choice was made. + + * If you give a ``CUDA_ARCH`` value less than `sm_35` (e.g., `sm_30`), + CMake will report this and stop processing. + +Also, RAJA relies on the CUB CUDA utilities library for some CUDA functionality. +CUB is included with RAJA as a Git submodule and this version will be used if +you do not specify an alternative. To use an externally-supplied CUB library, +provide the following options to CMake: +``-DENABLE_EXTERNAL_CUB=On -DCUB_DIR=``. + +.. note:: It is important to note that the version of Googletest that + is used in RAJA version v0.11.0 or newer requires CUDA version + 9.2.x or newer when compiling with nvcc. Thus, if you build + RAJA with CUDA enabled and want to also enable RAJA tests, you + must use CUDA version 9.2.x or newer. + +Hip +^^^^ + +To run RAJA code on AMD GPUs, one typically uses the Hip compiler and tool +chain (which can also be used to compile code for NVIDIA GPUs). + +.. note:: RAJA requires version 3.5 or newer of the rocm software stack to + use the RAJA Hip back-end. + +OpenMP +^^^^^^^ + +To use OpenMP target offlad GPU execution, additional options may need to be +passed to the compiler. The variable ``OpenMP_CXX_FLAGS`` is used for this. +Option syntax follows the CMake *list* pattern. For example, to specify OpenMP +target options for NVIDIA GPUs using a clang-based compiler, one may do +something like:: + + cmake \ + .... + -DOpenMP_CXX_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" + +---------------------------------------- +RAJA Example Build Configuration Files +---------------------------------------- + +The ``RAJA/scripts`` directory contains subdirectories with a variety of +build scripts we use to build and test RAJA on various platforms with +various compilers. These scripts pass files (*CMake cache files*) located in +the ``RAJA/host-configs`` directory to CMake using the '-C' option. +These files serve as useful examples of how to configure RAJA prior to +compilation. + ---------------- Installing RAJA ---------------- diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index fef1c354a1..25eaf980f9 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -13,8 +13,8 @@ RAJA User Guide RAJA is a software library of C++ abstractions, developed at Lawrence Livermore National Laboratory (LLNL), that enable architecture and programming model -portability for high performance computing (HPC) applications. RAJA has two -main goals: +portability for high performance computing (HPC) applications. RAJA has two +main goals: #. To enable application portability with manageable disruption to existing algorithms and programming styles. #. To achieve performance comparable to using common programming models (e.g., OpenMP, CUDA, etc.) directly. @@ -26,7 +26,7 @@ that extend the generally-accepted *parallel for* idiom. 
Background and Motivation ============================= -Many HPC applications must achieve high performance across a diverse +Many HPC applications must achieve high performance across a diverse range of computer architectures including: Mac and Windows laptops, parallel clusters of multicore commodity processors, and large-scale supercomputers with advanced heterogeneous node architectures that combine @@ -39,37 +39,47 @@ have been made in highly-scalable MPI-only applications that have been in service over multiple platform generations. Often, maintaining developer and user productivity requires the ability to build single-source application source code bases that can be readily ported to new architectures. RAJA is -one C++-based programming model abstraction layer that can help to meet this -performance portability challenge. +one C++ abstraction layer that helps address this performance portability +challenge. RAJA provides portable abstractions for simple and complex loops -- as well -as a variety of loop transformations, reductions, scans, atomic operations, -data layouts and views, iteration spaces, etc. Currently available execution -patterns supported by different programming model back-ends include: -sequential, -`SIMD `_, -`NVIDIA CUDA `_, -`OpenMP `_ CPU multi-threading and target offload. -OpenMP target offload support is incomplete and should be considered -experimental. Support for `Intel Threading Building Blocks (TBB) `_ and `AMD HIP `_ support are also under development. +reductions, scans, atomic operations, sorts, data layouts, views, and loop +iteration spaces, as well as compile-time loop transformations. Features +are continually growing as new use cases arise due to expanding user adoption. -RAJA uses standard C++11 -- C++ is the predominant programming language in -many LLNL applications. RAJA requirements and design are rooted in a +RAJA uses standard C++11 -- C++ is the programming language model of choice +for many HPC applications. RAJA requirements and design are rooted in a decades of developer experience working on production mesh-based -multiphysics applications at LLNL. An important RAJA requirement is that -application developers can specialize RAJA concepts for different code -implementation patterns and C++ usage, since data structures and algorithms +multiphysics applications. An important RAJA requirement is that +application developers can specialize RAJA concepts for different code +implementation patterns and C++ usage, since data structures and algorithms vary widely across applications. RAJA helps developers insulate application loop kernels from underlying architecture and programming model-specific implementation details. Loop bodies and loop execution are decoupled using C++ lambda expressions (loop bodies) and C++ templates (loop execution methods). This approach -promotes the perspective that developers should focus on tuning +promotes the perspective that application developers should focus on tuning loop patterns rather than individual loops as much as possible. RAJA makes it relatively straightforward to parameterize an application using execution policy types so that it can be compiled in a specific configuration suitable -to a given architecture. +to a given architecture. + +RAJA support for various execution back-ends is the result of collaborative +development between the RAJA team and academic and industrial partners. 
+Currently available execution back-ends include: +sequential, +`SIMD `_, +`Threading Building Blocks (TBB) `_, +`NVIDIA CUDA `_, +`OpenMP `_ CPU multithreading and target offload, and +`AMD HIP `_. Sequential, +CUDA, OpenMP CPU multithreading, and HIP execution are supported for all +RAJA features. Sequential, OpenMP CPU multithreading, and CUDA +are considered the most developed at this point as these have been our primary +focus up to now. Those back-ends are used in a wide variety of production +applications. OpenMP target offload and TBB back-ends do not support +all RAJA features and should be considered experimental. ================================ Interacting with the RAJA Team @@ -77,29 +87,29 @@ Interacting with the RAJA Team If you are interested in keeping up with RAJA development and communicating with developers and users, please join our `Google Group -`_, or contact the +`_, or contact the development team via email at ``raja-dev@llnl.gov`` If you have questions, find a bug, have ideas about expanding the functionality or applicability, or wish to contribute to RAJA development, please do not hesitate to contact us. We are always -interested in improving RAJA and exploring new ways to use it. A brief -description of how the RAJA team operates can be found in +interested in improving RAJA and exploring new ways to use it. A brief +description of how the RAJA team operates can be found in :ref:`contributing-label`. ============================= What's In This Guide? ============================= -If you have some familiarity with RAJA and want to get up and running quickly, -check out :ref:`getting_started-label`. This guide contains information +If you have some familiarity with RAJA and want to get up and running quickly, +check out :ref:`getting_started-label`. This guide contains information about accessing the RAJA code, building it, and basic RAJA usage. If you are completely new to RAJA, please check out the :ref:`tutorial-label`. -It contains a discussion of essential C++ concepts and will walk you +It contains a discussion of essential C++ concepts and will walk you through a sequence of code examples that show how to use key RAJA features. -See :ref:`features-label` for a complete, high-level description of RAJA +See :ref:`features-label` for a complete, high-level description of RAJA features (like a reference guide). Additional information about things to think about when considering whether @@ -116,4 +126,5 @@ to use RAJA in an application can be found in :ref:`app-considerations-label`. config_options plugins contributing + developer_guide raja_license diff --git a/docs/sphinx/user_guide/plugins.rst b/docs/sphinx/user_guide/plugins.rst index df603e9702..dabe14acb5 100644 --- a/docs/sphinx/user_guide/plugins.rst +++ b/docs/sphinx/user_guide/plugins.rst @@ -29,7 +29,7 @@ that kernel executes, CHAI will make the data available. To build CHAI with RAJA integration, you need to download and install CHAI with the ``ENABLE_RAJA_PLUGIN`` option turned on. 
Please see the `CHAI project -` for details +`_ for details After CHAI has been build with RAJA support enabled, applications can use CHAI ``ManangedArray`` objects to access data inside a RAJA kernel; for example,:: diff --git a/docs/sphinx/user_guide/tutorial.rst b/docs/sphinx/user_guide/tutorial.rst index 9c59792023..6b9d929ebc 100644 --- a/docs/sphinx/user_guide/tutorial.rst +++ b/docs/sphinx/user_guide/tutorial.rst @@ -14,7 +14,9 @@ RAJA Tutorial This RAJA tutorial introduces RAJA concepts and capabilities via a sequence of examples of increasing complexity. Complete working codes for -the examples are located in the ``RAJA``examples`` directory. +the examples are located in the ``RAJA/examples`` directory. The RAJA +tutorial evolves as we add new features to RAJA, so refer to it periodically +if you are interested in learning about them. To understand the discussion and code examples, a working knowledge of C++ templates and lambda expressions is required. So, before we begin, we provide @@ -27,27 +29,26 @@ transfers between those memory spaces work. For a detailed discussion, see `Device Memory `_. RAJA does not provide a memory model. This is by design as developers of many -of the production applications for which RAJA is targeted prefer to manage -memory themselves. Thus, users are responsible for ensuring that data is -properly allocated and initialized on a GPU device when running GPU code. -This can be done using explicit host and device allocation and copying -between host and device memory spaces or via CUDA unified memory (UM), if -available. RAJA developers also support a library called -`CHAI `_ which complements RAJA by providing -a simple alternative to manual CUDA calls or UM. For more -information, see :ref:`plugins-label`. +of applications that use RAJA prefer to manage memory themselves. Thus, users +are responsible for ensuring that data is properly allocated and initialized +on a GPU device when running GPU code. This can be done using explicit host +and device allocation and copying between host and device memory spaces or via +unified memory (UM), if available. RAJA developers also support a library +called `CHAI `_ which complements RAJA by +providing an alternative to manual host-device memory copy calls or UM. +For more information, see :ref:`plugins-label`. .. _tutorial-lambda-label: =============================== -A Little C++ Lambda Background +A Little C++ Background =============================== RAJA makes heavy use of C++ templates and using RAJA most easily and effectively is done by representing the bodies of loop kernels as C++ lambda -expressions. Alternatively, C++ functors can be used, but we don't recommend -them as they make application source code more complex, potentially placing -a significant negative burden on source code readability and maintainability. +expressions. Alternatively, C++ functors can be used, but they make +application source code more complex, potentially placing a significant +negative burden on source code readability and maintainability. ----------------------------------- C++ Templates ----------------------------------- @@ -65,15 +66,16 @@ template method defined as:: ... 
} -Here, "ExecPol", "IdxType", "LoopBody" are C++ types you, as a user, specify at -compile-time, like this:: +Here, "ExecPol", "IdxType", and "LoopBody" are C++ types a user specifies in +their code; for example:: - forall< RAJA::seq_exec >( RAJA::RangeSegment(0, N), [=](int i) { + RAJA::forall< RAJA::seq_exec >( RAJA::RangeSegment(0, N), [=](int i) { a[i] = b[i] + c[i]; }); -The "IdxTypes" and "LoopBody" types are deduced by the compiler based on what -you specify. Here, the loop body type is defined by the lambda expression:: +The "IdxType" and "LoopBody" types are deduced by the compiler based on what +arguments are passed to the ``RAJA::forall`` method. Here, the loop body type +is defined by the lambda expression:: [=](int i) { a[i] = b[i] + c[i]; } @@ -86,8 +88,8 @@ expressions. A more technical and detailed discussion is available here: `Lambda Functions in C++11 - the Definitive Guide `_ Lambda expressions were introduced in C++ 11 to provide a lexical-scoped -name binding; that is, a *closure* that stores a function with a data -environment. In particular, a lambda expression can *capture* variables from an +name binding; specifically, a *closure* that stores a function with a data +environment. That is, a lambda expression can *capture* variables from an enclosing scope for use within the local scope of the function expression. A C++ lambda expression has the following form:: @@ -97,16 +99,16 @@ A C++ lambda expression has the following form:: The ``capture list`` specifies how variables outside the lambda scope are pulled into the lambda data environment. The ``parameter list`` defines arguments passed to the lambda function body -- for the most part, lambda arguments -are just like arguments to a standard C++ method. Variables in the capture list +are just like arguments in a regular C++ method. Variables in the capture list are initialized when the lambda expression is created, while those in the parameter list are set when the lambda expression is called. The body of a lambda expression is similar to the body of an ordinary C++ method. RAJA templates, such as ``RAJA::forall`` and ``RAJA::kernel`` pass arguments -to lambdas based on usage and context; typically, these are loop indices. +to lambdas based on usage and context; e.g., loop iteration indices. A C++ lambda expression can capture variables in the capture list by value or by reference. This is similar to how arguments to C++ methods are passed; -e.g., pass-by-reference or pass-by-value. However, there are some subtle +i.e., *pass-by-reference* or *pass-by-value*. However, there are some subtle differences between lambda variable capture rules and those for ordinary methods. Variables mentioned in the capture list with no extra symbols are captured by value. Capture-by-reference is accomplished by using the @@ -128,15 +130,16 @@ or:: Note that the following two attempts will generate compilation errors:: - [=](){ x = y; }; // capture all lambda arguments by value... - [x, &y](){ x = y; }; // capture 'x' by value and 'y' by reference... + [=](){ x = y; }; // error: all lambda arguments captured by value, + // so cannot assign to 'x'. + [x, &y](){ x = y; }; // error: cannot assign to 'x' since it is captured + // by value. -Specifically, it is illegal to assign a value to a variable 'x' that is -captured by value since it is `read-only`. 
+**Specifically, a variable that is captured by value is read-only.** ------------------------------------ -Notes About C++ Lambdas ------------------------------------ +---------------------------------------- +A Few Notes About Lambda Usage With RAJA +---------------------------------------- There are several issues to note about C++ lambda expressions; in particular, with respect to RAJA usage. We describe them here. @@ -199,13 +202,14 @@ with respect to RAJA usage. We describe them here. } ); - * **Local stack arrays are not captured by CUDA device lambdas.** + * **Local stack arrays may not be captured by CUDA device lambdas.** - Although this is inconsistent with the C++ standard, attempting to access - elements in a local stack array in a CUDA device lambda may generate a - compilation error depending on the version of the nvcc compiler you are - using. One solution to this problem is to wrap the array in a - struct; for example:: + Although this is inconsistent with the C++ standard (local stack arrays + are properly captured in lambdas for code that will execute on a CPU), + attempting to access elements in a local stack array in a CUDA device + lambda may generate a compilation error depending on the version of the + nvcc compiler you are using. One solution to this problem is to wrap the + array in a struct; for example:: struct array_wrapper { int[4] array; } bounds; bounds.array = { 0, 1, 2, 3 }; RAJA::forall<...>(..., [=] (...) { // access entries of bounds.array } ); - This issue appears to be resolved in in the 10.1 release of the nvcc - compiler. If you are using an earlier version of nvcc, an implementation + This issue appears to be resolved in the 10.1 release of CUDA. If you + are using an earlier version of nvcc, an implementation similar to the one above will be required. @@ -226,17 +230,17 @@ with respect to RAJA usage. We describe them here. RAJA Examples ================ -The remainder of this tutorial illustrates how to use RAJA features using -various working code examples that are located in the ``RAJA/examples`` +The remainder of this tutorial illustrates how to use RAJA features with +working code examples that are located in the ``RAJA/examples`` directory. Additional information about the RAJA features used can be found in :ref:`features-label`. The examples demonstrate CPU execution (sequential, SIMD, OpenMP -multi-threading) and CUDA GPU execution. Examples that show how to use +multithreading) and CUDA GPU execution. Examples that show how to use RAJA with other parallel programming model back-ends that are in -development will appear when we feel RAJA support for them is sufficiently -complete and robust. For adventurous users who wish to try experimental -features, usage is similar to what is shown in the examples here. +development will appear in future RAJA releases. For adventurous users who +wish to try experimental features, usage is similar to what is shown in the +examples here. All RAJA programming model support features are enabled via CMake options, which are described in :ref:`configopt-label`. @@ -253,7 +257,7 @@ Simple Loops and Basic RAJA Features The examples in this section illustrate how to use ``RAJA::forall`` methods to execute simple loop kernels; i.e., non-nested loops. It also describes -iteration spaces, reductions, atomic operations, and scans. +iteration spaces, reductions, atomic operations, scans, and sorts. .. toctree:: :maxdepth: 1
   tutorial/reductions.rst
   tutorial/atomic_histogram.rst
   tutorial/scan.rst
+  tutorial/sort.rst

.. _tutorialcomplex-label:

@@ -286,3 +291,4 @@ tiling mechanisms to transform loop patterns.
   tutorial/offset-layout.rst
   tutorial/tiled_matrix_transpose.rst
   tutorial/matrix_transpose_local_array.rst
+  tutorial/halo-exchange.rst
diff --git a/docs/sphinx/user_guide/tutorial/add_vectors.rst b/docs/sphinx/user_guide/tutorial/add_vectors.rst
index 891258758d..c9ec806645 100644
--- a/docs/sphinx/user_guide/tutorial/add_vectors.rst
+++ b/docs/sphinx/user_guide/tutorial/add_vectors.rst
@@ -73,7 +73,7 @@ This policy allows the compiler to generate optimizations, such as SIMD if
compiler heuristics suggest that it is safe to do so and potentially
beneficial for performance, but the optimizations are not forced.

-To run the kernel with OpenMP multi-threaded parallelism on a CPU, we use the
+To run the kernel with OpenMP multithreaded parallelism on a CPU, we use the
``RAJA::omp_parallel_for_exec`` execution policy:

.. literalinclude:: ../../../../examples/tut_add-vectors.cpp
diff --git a/docs/sphinx/user_guide/tutorial/dot_product.rst b/docs/sphinx/user_guide/tutorial/dot_product.rst
index 5e8d328e4f..ae69eb0f55 100644
--- a/docs/sphinx/user_guide/tutorial/dot_product.rst
+++ b/docs/sphinx/user_guide/tutorial/dot_product.rst
@@ -65,7 +65,7 @@ a reduction value type (i.e., 'double'). An initial value of zero for the
sum is passed to the reduction object constructor. After the kernel executes,
we use the 'get' method to retrieve the reduced value.

-The OpenMP multi-threaded variant of the loop is implemented similarly:
+The OpenMP multithreaded variant of the loop is implemented similarly:

.. literalinclude:: ../../../../examples/tut_dot-product.cpp
   :start-after: _rajaomp_dotprod_start
diff --git a/docs/sphinx/user_guide/tutorial/halo-exchange.rst b/docs/sphinx/user_guide/tutorial/halo-exchange.rst
new file mode 100644
index 0000000000..f1460f964b
--- /dev/null
+++ b/docs/sphinx/user_guide/tutorial/halo-exchange.rst
@@ -0,0 +1,230 @@
+.. ##
+.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC
+.. ## and RAJA project contributors. See the RAJA/COPYRIGHT file
+.. ## for details.
+.. ##
+.. ## SPDX-License-Identifier: (BSD-3-Clause)
+.. ##
+
+.. _halo_exchange-label:
+
+------------------------------------
+Halo Exchange (Workgroup Constructs)
+------------------------------------
+
+Key RAJA features shown in this example:
+
+  * ``RAJA::WorkPool`` workgroup construct
+  * ``RAJA::WorkGroup`` workgroup construct
+  * ``RAJA::WorkSite`` workgroup construct
+  * ``RAJA::RangeSegment`` iteration space construct
+  * RAJA workgroup policies
+
+In this example, we show how to use the RAJA workgroup constructs to implement
+halo exchange packing and unpacking. This may not speed up halo exchange on
+CPUs, but it can significantly speed up halo exchange on GPUs compared to using
+``RAJA::forall`` to run individual kernels.
+
+.. note:: Using an abstraction layer over RAJA can make it easy to switch
+          between using individual ``RAJA::forall`` loops or the RAJA workgroup
+          constructs to implement halo exchange packing and unpacking at
+          compile time or run time.
+
+We start by setting the parameters for the halo exchange, using the default
+values or parsing the command line input. These parameters determine the size
+of the mesh, the width of the halo, the number of variables, and the number of
+cycles.
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_input_params_start
+   :end-before: _halo_exchange_input_params_end
+   :language: C++
+
+Next, we allocate the variables array (the memory manager in
+the example uses CUDA Unified Memory if CUDA is enabled). These grid variables
+will be reset each cycle to allow checking the results of the packing and
+unpacking.
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_vars_allocate_start
+   :end-before: _halo_exchange_vars_allocate_end
+   :language: C++
+
+We also allocate and initialize index lists of the grid elements to pack and
+unpack:
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_index_list_generate_start
+   :end-before: _halo_exchange_index_list_generate_end
+   :language: C++
+
+All the code examples presented below copy the data packed from just inside
+the mesh variable:
+
+  +---+---+---+---+---+
+  | 0 | 0 | 0 | 0 | 0 |
+  +---+---+---+---+---+
+  | 0 | 1 | 2 | 3 | 0 |
+  +---+---+---+---+---+
+  | 0 | 4 | 5 | 6 | 0 |
+  +---+---+---+---+---+
+  | 0 | 7 | 8 | 9 | 0 |
+  +---+---+---+---+---+
+  | 0 | 0 | 0 | 0 | 0 |
+  +---+---+---+---+---+
+
+into the adjacent halo:
+
+  +---+---+---+---+---+
+  | 1 | 1 | 2 | 3 | 3 |
+  +---+---+---+---+---+
+  | 1 | 1 | 2 | 3 | 3 |
+  +---+---+---+---+---+
+  | 4 | 4 | 5 | 6 | 6 |
+  +---+---+---+---+---+
+  | 7 | 7 | 8 | 9 | 9 |
+  +---+---+---+---+---+
+  | 7 | 7 | 8 | 9 | 9 |
+  +---+---+---+---+---+
+
+
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Packing and Unpacking (Basic Loop Execution)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A sequential non-RAJA example of packing:
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_sequential_cstyle_packing_start
+   :end-before: _halo_exchange_sequential_cstyle_packing_end
+   :language: C++
+
+and unpacking:
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_sequential_cstyle_unpacking_start
+   :end-before: _halo_exchange_sequential_cstyle_unpacking_end
+   :language: C++
+
+
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+RAJA Variants using forall
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A sequential RAJA example using these policies and types:
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_loop_forall_policies_start
+   :end-before: _halo_exchange_loop_forall_policies_end
+   :language: C++
+
+of packing:
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_loop_forall_packing_start
+   :end-before: _halo_exchange_loop_forall_packing_end
+   :language: C++
+
+and unpacking:
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_loop_forall_unpacking_start
+   :end-before: _halo_exchange_loop_forall_unpacking_end
+   :language: C++
+
+
+For parallel multithreaded execution via OpenMP, the example can be run
+by replacing the execution policy with:
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_openmp_forall_policies_start
+   :end-before: _halo_exchange_openmp_forall_policies_end
+   :language: C++
+
+Similarly, to run the loops in parallel on a CUDA GPU, use these policies:
+
+.. 
literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_cuda_forall_policies_start + :end-before: _halo_exchange_cuda_forall_policies_end + :language: C++ + + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RAJA Variants using workgroup constructs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Using the workgroup constructs in the example requires defining a few more +policies and types: + +.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_loop_workgroup_policies_start + :end-before: _halo_exchange_loop_workgroup_policies_end + :language: C++ + +which are used in a slightly rearranged version of packing. See how the comment +indicating where a message could be sent has been moved down after the call to +run on the workgroup: + +.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_loop_workgroup_packing_start + :end-before: _halo_exchange_loop_workgroup_packing_end + :language: C++ + +Similarly in the unpacking we wait to receive all of the messages before +unpacking is done: + +.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_loop_workgroup_unpacking_start + :end-before: _halo_exchange_loop_workgroup_unpacking_end + :language: C++ + +This reorganization has the downside of not overlapping the message sends with +packing and the message receives with unpacking. + +For parallel multi-threading execution via OpenMP, the example using workgroup +can be run by replacing the policies and types with: + +.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_openmp_workgroup_policies_start + :end-before: _halo_exchange_openmp_workgroup_policies_end + :language: C++ + +Similarly, to run the loops in parallel on a CUDA GPU use these policies and +types, taking note of the unordered work ordering policy that allows the +enqueued loops to all be run using a single cuda kernel: + +.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_cuda_workgroup_policies_start + :end-before: _halo_exchange_cuda_workgroup_policies_end + :language: C++ + +The packing is the same as the previous workgroup packing examples with the +exception of added synchronization after calling run and before sending the +messages. The previous cuda example used forall to launch +``num_neighbors * num_vars`` cuda kernels and performed ``num_neighbors`` +synchronizations to send each message in turn. Here the reorganization to pack +all messages before sending lets us use an unordered cuda work ordering policy +in the workgroup constructs that reduces the number of cuda kernel launches to +one. It also allows us to synchronize once before sending all of the messages: + +.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_cuda_workgroup_packing_start + :end-before: _halo_exchange_cuda_workgroup_packing_end + :language: C++ + +After waiting to receive all of the messages we use workgroup constructs using +a cuda unordered work ordering policy to unpack all of the messages using a +single kernel launch: + +.. 
literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_cuda_workgroup_unpacking_start + :end-before: _halo_exchange_cuda_workgroup_unpacking_end + :language: C++ + +Note that the synchronization after unpacking is done to ensure that +``group_unpack`` and ``site_unpack`` survive until the unpacking loop has +finished executing. + + +The file ``RAJA/examples/tut_halo-exchange.cpp`` contains the complete +working example code. diff --git a/docs/sphinx/user_guide/tutorial/indexset_segments.rst b/docs/sphinx/user_guide/tutorial/indexset_segments.rst index cf903a1570..febdf4403d 100644 --- a/docs/sphinx/user_guide/tutorial/indexset_segments.rst +++ b/docs/sphinx/user_guide/tutorial/indexset_segments.rst @@ -168,7 +168,7 @@ policy as before. Before we end the discussion of these examples, we demonstrate a few more index set execution policy variations. To run the previous three segment code by iterating over the segments sequentially and executing each -segment in parallel using OpenMP multi-threading, we would use this policy +segment in parallel using OpenMP multithreading, we would use this policy definition: .. literalinclude:: ../../../../examples/tut_indexset-segments.cpp diff --git a/docs/sphinx/user_guide/tutorial/matrix_multiply.rst b/docs/sphinx/user_guide/tutorial/matrix_multiply.rst index f0bdd84fcc..4b3f09c58e 100644 --- a/docs/sphinx/user_guide/tutorial/matrix_multiply.rst +++ b/docs/sphinx/user_guide/tutorial/matrix_multiply.rst @@ -170,7 +170,7 @@ reorder for-statements for each loop nest level. These execution patterns and transformations can be achieved by changing only the policy and leaving the loop kernel code as is. -If we want to execute the row loop using OpenMP multi-threaded parallelism +If we want to execute the row loop using OpenMP multithreaded parallelism and keep the column loop sequential, the policy we would use is: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp @@ -242,8 +242,8 @@ to specify which arguments each lambda takes and in which order. For example: By using ``RAJA::statement::Lambda`` parameters in this way, the code potentially indicates more clearly which areguments are used. Of course, this makes the execution policy more verbose, but that is typically hidden away -in a header file. Statements such as ``RAJA::statement::Segs``, and -``RAJA::statement::Params`` identify the positions of the segments and params +in a header file. Statements such as ``RAJA::Segs``, and +``RAJA::Params`` identify the positions of the segments and params in the tuples to be used as arguments to the lambda expressions. As we noted earlier, the execution policy type passed to the diff --git a/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst b/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst index f36244c8d6..7dd78085c4 100644 --- a/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst +++ b/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst @@ -116,7 +116,7 @@ kernel is: The ``RAJA::statement::Tile`` types in the execution policy define tiling of the outer 'row' (iteration space tuple index '1') and 'col' (iteration space tuple index '0') loops, including tile sizes -(``RAJA::statement::tile_fixed`` types) and loop execution policies. Next, +(``RAJA::tile_fixed`` types) and loop execution policies. 
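In isolation, and leaving aside the local-array statements discussed next, a tiled
``RAJA::kernel`` policy has roughly the following shape. This is only a sequential sketch with
illustrative tile sizes; it is not the exact policy used in the example::

    using TILED_EXEC_POL =
      RAJA::KernelPolicy<
        // Tile the 'row' (tuple index 1) and 'col' (tuple index 0) iteration spaces.
        RAJA::statement::Tile<1, RAJA::tile_fixed<16>, RAJA::loop_exec,
          RAJA::statement::Tile<0, RAJA::tile_fixed<16>, RAJA::loop_exec,
            // Walk the entries of each tile and invoke the loop body.
            RAJA::statement::For<1, RAJA::loop_exec,
              RAJA::statement::For<0, RAJA::loop_exec,
                RAJA::statement::Lambda<0>
              >
            >
          >
        >
      >;

The policy in the example adds the local memory statements described below to this basic tiled
structure.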
Next, the ``RAJA::statement::InitLocalMem`` type initializes the local stack array based on the memory policy type (here, we use ``RAJA::cpu_tile_mem`` for a CPU stack-allocated array). The ``RAJA::ParamList<2>`` parameter indicates @@ -182,7 +182,7 @@ execution policy and kernel: :language: C++ Here, the two ``RAJA::statement::Lambda`` types in the execution policy show -two different ways to specify the segments (``RAJA::statement::Segs``) +two different ways to specify the segments (``RAJA::Segs``) associated with the matrix column and row indices. That is, we can use a ``Segs`` statement for each argument, or include multiple segment ids in one statement. @@ -191,7 +191,7 @@ Note that we are using ``RAJA::statement::For`` types for the inner tile loops instead of `RAJA::statement::ForICount`` types used in the first variant. As a consequence of specifying lambda arguments, there are two main differences. The local tile indices are properly computed and passed to the lambda -expressions as a result of the ``RAJA::statement::Offsets`` types that appear +expressions as a result of the ``RAJA::Offsets`` types that appear in the lambda statement types. The ``RAJA::statement::Lambda`` type for each lambda shows the two ways to specify the local tile index args; we can use an ``Offsets`` statement for each argument, or include multiple segment ids in one diff --git a/docs/sphinx/user_guide/tutorial/nested_loop_reorder.rst b/docs/sphinx/user_guide/tutorial/nested_loop_reorder.rst index def7d1b9ad..7165d8d6dd 100644 --- a/docs/sphinx/user_guide/tutorial/nested_loop_reorder.rst +++ b/docs/sphinx/user_guide/tutorial/nested_loop_reorder.rst @@ -103,7 +103,7 @@ for-loops, the loop would appear as:: for (int k = 2; k< 4; ++k) { for (int j = 1; j < 3; ++j) { - for (int i = 0; j < 2; ++i) { + for (int i = 0; i < 2; ++i) { // print loop index triple... } } diff --git a/docs/sphinx/user_guide/tutorial/reductions.rst b/docs/sphinx/user_guide/tutorial/reductions.rst index a65b7e94d9..5fbcdfab59 100644 --- a/docs/sphinx/user_guide/tutorial/reductions.rst +++ b/docs/sphinx/user_guide/tutorial/reductions.rst @@ -70,7 +70,7 @@ object is retrieved after the kernel by calling a 'get()' method on the reduction object. The min-loc/max-loc index values are obtained using 'getLoc()' methods. -For parallel multi-threading execution via OpenMP, the example can be run +For parallel multithreading execution via OpenMP, the example can be run by replacing the execution and reduction policies with: .. literalinclude:: ../../../../examples/tut_reductions.cpp diff --git a/docs/sphinx/user_guide/tutorial/scan.rst b/docs/sphinx/user_guide/tutorial/scan.rst index db264781db..c5eec8d9d1 100644 --- a/docs/sphinx/user_guide/tutorial/scan.rst +++ b/docs/sphinx/user_guide/tutorial/scan.rst @@ -69,7 +69,7 @@ We can be explicit about the operation used in the scan by passing the The result in the 'out' array is the same. -An inclusive parallel scan operation using OpenMP multi-threading is +An inclusive parallel scan operation using OpenMP multithreading is accomplished similarly by replacing the execution policy type: .. literalinclude:: ../../../../examples/tut_scan.cpp diff --git a/docs/sphinx/user_guide/tutorial/sort.rst b/docs/sphinx/user_guide/tutorial/sort.rst new file mode 100644 index 0000000000..27ac6310a1 --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/sort.rst @@ -0,0 +1,204 @@ +.. ## +.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. 
See the RAJA/COPYRIGHT file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _sort-label: + +-------------------------------------------------- +Parallel Sort Operations +-------------------------------------------------- + +Key RAJA features shown in this section: + + * ``RAJA::sort`` operation + * ``RAJA::sort_pairs`` operation + * ``RAJA::stable_sort`` operation + * ``RAJA::stable_sort_pairs`` operation + * RAJA comparators for different types of sorts; e.g., less, greater + +Below, we present examples of RAJA sequential, OpenMP, +and CUDA sort operations and show how different sort orderings can be +achieved by passing different RAJA comparators to the RAJA sort template +methods. Each comparator is a template type, where the template argument is +the type of the values it compares. For a summary of RAJA sort +functionality, please see :ref:`sort-label`. + +.. note:: RAJA sort operations use the same execution policy types that + ``RAJA::forall`` loop execution templates do. + +Each of the examples below uses the same integer arrays for input +and output values. We set the input array and print them as follows: + +.. literalinclude:: ../../../../examples/tut_sort.cpp + :start-after: _sort_array_init_start + :end-before: _sort_array_init_end + :language: C++ + +This generates the following sequence of values in the ``in`` array:: + + 6 7 2 1 0 9 4 8 5 3 4 9 6 3 7 0 1 8 2 5 + +This generates the following sequence of values in the ``in`` and ``in_vals`` +arrays:: + + (6,0) (7,0) (2,0) (1,0) (0,0) (9,0) (4,0) (8,0) (5,0) (3,0) + (4,1) (9,1) (6,1) (3,1) (7,1) (0,1) (1,1) (8,1) (2,1) (5,1) + +^^^^^^^^^^^^^^^^ +Unstable Sorts +^^^^^^^^^^^^^^^^ + +A sequential unstable sort operation is performed by: + +.. literalinclude:: ../../../../examples/tut_sort.cpp + :start-after: _sort_seq_start + :end-before: _sort_seq_end + :language: C++ + +Since no comparator is passed to the sort method, the default less operation +is applied and the result generated in the ``out`` array is non-decreasing sort +on the ``out`` array. The resulting ``out`` array contains the values:: + + 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 + +We can be explicit about the operation used in the sort by passing the +less operator to the sort method: + +.. literalinclude:: ../../../../examples/tut_sort.cpp + :start-after: _sort_seq_less_start + :end-before: _sort_seq_less_end + :language: C++ + +The result in the ``out`` array is the same. + +An unstable parallel sort operation using OpenMP multi-threading is +accomplished similarly by replacing the execution policy type: + +.. literalinclude:: ../../../../examples/tut_sort.cpp + :start-after: _sort_omp_less_start + :end-before: _sort_omp_less_end + :language: C++ + +As is commonly done with RAJA, the only difference between this code and +the previous one is that the execution policy is different. If we want to +run the sort on a GPU using CUDA, we would use a CUDA execution policy. This +will be shown shortly. + +^^^^^^^^^^^^^^^^ +Stable Sorts +^^^^^^^^^^^^^^^^ + +A sequential stable sort (less) operation is performed by: + +.. literalinclude:: ../../../../examples/tut_sort.cpp + :start-after: _sort_stable_seq_less_start + :end-before: _sort_stable_seq_less_end + :language: C++ + +This generates the following sequence of values in the output array:: + + 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 + +Note that the stable sort result is the same as the unstable sort in this case +because we are sorting integers. 
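Stability only becomes observable when elements that compare equal carry additional,
distinguishable data. The following small sketch uses the C++ standard library (not the RAJA
sort methods) purely to illustrate that point::

    #include <algorithm>
    #include <utility>
    #include <vector>

    int main()
    {
      std::vector<std::pair<int, int>> v = {{2, 0}, {1, 0}, {2, 1}, {1, 1}};

      // Order by key (first member) only; the second member is a payload.
      auto comp = [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
        return a.first < b.first;
      };

      std::stable_sort(v.begin(), v.end(), comp);
      // Guaranteed result: (1,0) (1,1) (2,0) (2,1) -- pairs with equal keys keep
      // their original relative order; an unstable sort is free to swap them.

      return 0;
    }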
We will show an example of sorting pairs later
+where this is not the case.
+
+Running the same sort operation on a GPU using CUDA is done by:
+
+.. literalinclude:: ../../../../examples/tut_sort.cpp
+   :start-after: _sort_stable_cuda_less_start
+   :end-before: _sort_stable_cuda_less_end
+   :language: C++
+
+Note that we pass the number of threads per CUDA thread block as the template
+argument to the CUDA execution policy as we do in other cases.
+
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Other Comparators
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Using a different comparator allows sorting in a different order.
+Here is a sequential stable sort that uses the greater operator:
+
+.. literalinclude:: ../../../../examples/tut_sort.cpp
+   :start-after: _sort_stable_seq_greater_start
+   :end-before: _sort_stable_seq_greater_end
+   :language: C++
+
+This generates the following sequence of values in non-increasing order in
+the output array::
+
+   9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0
+
+Note that less and greater are the only operators provided by RAJA that are
+valid to use in sort, because they form a strict weak ordering of elements
+for arithmetic types. Also note that the CUDA sort back-end only supports
+RAJA's less and greater operators.
+
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Sort Pairs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Sort *Pairs* operations generate the same results as the sort operations
+we have just described. However, an additional array of values is also permuted
+to match the sorted array, so **two arrays are passed to sort pairs methods.**
+
+Here is a sequential unstable sort pairs that uses the less operator:
+
+.. literalinclude:: ../../../../examples/tut_sort.cpp
+   :start-after: _sort_pairs_seq_less_start
+   :end-before: _sort_pairs_seq_less_end
+   :language: C++
+
+This generates the following sequence in the output array::
+
+   (0,0) (0,1) (1,0) (1,1) (2,0) (2,1) (3,0) (3,1) (4,0) (4,1)
+   (5,1) (5,0) (6,1) (6,0) (7,0) (7,1) (8,0) (8,1) (9,1) (9,0)
+
+Note that some of the pairs with equivalent keys stayed in the same order
+they appeared in the unsorted arrays, like ``(8,0) (8,1)``, while others are
+reversed, like ``(9,1) (9,0)``.
+
+Here is a sequential stable sort pairs that uses the greater operator:
+
+.. literalinclude:: ../../../../examples/tut_sort.cpp
+   :start-after: _sort_stable_pairs_seq_greater_start
+   :end-before: _sort_stable_pairs_seq_greater_end
+   :language: C++
+
+This generates the following sequence in the output array::
+
+   (9,0) (9,1) (8,0) (8,1) (7,0) (7,1) (6,0) (6,1) (5,0) (5,1)
+   (4,0) (4,1) (3,0) (3,1) (2,0) (2,1) (1,0) (1,1) (0,0) (0,1)
+
+Note that all pairs with equivalent keys stayed in the same order that they
+appeared in the unsorted arrays.
+
+As you may expect at this point, running a stable sort pairs
+operation using OpenMP is accomplished by:
+
+.. literalinclude:: ../../../../examples/tut_sort.cpp
+   :start-after: _sort_stable_pairs_omp_greater_start
+   :end-before: _sort_stable_pairs_omp_greater_end
+   :language: C++
+
+This generates the following sequence in the output array (as we saw earlier)::
+
+   (9,0) (9,1) (8,0) (8,1) (7,0) (7,1) (6,0) (6,1) (5,0) (5,1)
+   (4,0) (4,1) (3,0) (3,1) (2,0) (2,1) (1,0) (1,1) (0,0) (0,1)
+
+and the only difference is the execution policy template parameter.
+
+Lastly, we show a parallel unstable sort pairs operation using CUDA:
+
+.. 
literalinclude:: ../../../../examples/tut_sort.cpp + :start-after: _sort_pairs_cuda_greater_start + :end-before: _sort_pairs_cuda_greater_start + :language: C++ + +The file ``RAJA/examples/tut_sort.cpp`` contains the complete +working example code. diff --git a/docs/sphinx/user_guide/tutorial/tiled_matrix_transpose.rst b/docs/sphinx/user_guide/tutorial/tiled_matrix_transpose.rst index 382023bd81..5707a2a2e9 100644 --- a/docs/sphinx/user_guide/tutorial/tiled_matrix_transpose.rst +++ b/docs/sphinx/user_guide/tutorial/tiled_matrix_transpose.rst @@ -60,7 +60,7 @@ RAJA::kernel Variants ^^^^^^^^^^^^^^^^^^^^^ For ``RAJA::kernel`` variants, we use ``RAJA::statement::Tile`` types -for the outer loop tiling and ``RAJA::statement::tile_fixed`` types to +for the outer loop tiling and ``RAJA::tile_fixed`` types to indicate the tile dimensions. The complete sequential RAJA variant is: .. literalinclude:: ../../../../examples/tut_tiled-matrix-transpose.cpp diff --git a/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst b/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst index 4b6ecd1f3b..f45fdbc6cf 100644 --- a/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst +++ b/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst @@ -102,7 +102,7 @@ using the vectors: Now, we can use an index set execution policy that iterates over the segments sequentially and executes each segment in parallel using OpenMP -multi-threading (and ``RAJA::forall``): +multithreading (and ``RAJA::forall``): .. literalinclude:: ../../../../examples/tut_vertexsum-coloring.cpp :start-after: _raja_seq_colorindexset_vertexsum_start diff --git a/docs/sphinx/user_guide/using_raja.rst b/docs/sphinx/user_guide/using_raja.rst index 5d04ef5e68..458585186d 100644 --- a/docs/sphinx/user_guide/using_raja.rst +++ b/docs/sphinx/user_guide/using_raja.rst @@ -12,11 +12,12 @@ Using RAJA in Your Application ****************************** -Using RAJA in an application requires two things: ensuring the header files +Using RAJA in an application requires two things: ensuring the RAJA header files are visible, and linking against the RAJA library. We maintain a `RAJA Template Project `_ -shows how to use RAJA in a CMake project, either as a Git submodule or -as an externally installed library that you link your application against. +that shows how to use RAJA in a project that uses CMake or make, either as a +Git submodule or as an externally installed library that you link your +application against. 
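For reference, a minimal program that compiles and links against RAJA might look like the
following; this is only a sketch (the execution policy, loop body, and sizes are arbitrary)::

    #include "RAJA/RAJA.hpp"

    #include <cstdio>

    int main()
    {
      constexpr int N = 10;
      double* a = new double[N];

      // Sequential RAJA loop; the pointer 'a' is captured by value and the
      // data it points to is written through it.
      RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N), [=](int i) {
        a[i] = 2.0 * i;
      });

      std::printf("a[%d] = %f\n", N - 1, a[N - 1]);

      delete[] a;
      return 0;
    }

If this builds and runs, the include path and link line are set up correctly.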
======================== CMake Configuration File diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 6f5be57599..b488e88050 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -5,6 +5,10 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### +raja_add_executable( + NAME resource-forall + SOURCES resource-forall.cpp) + raja_add_executable( NAME tut_daxpy SOURCES tut_daxpy.cpp) @@ -36,11 +40,15 @@ raja_add_executable( raja_add_executable( NAME tut_reductions SOURCES tut_reductions.cpp) - + raja_add_executable( NAME tut_scan SOURCES tut_scan.cpp) +raja_add_executable( + NAME tut_sort + SOURCES tut_sort.cpp) + raja_add_executable( NAME tut_atomic-histogram SOURCES tut_atomic-histogram.cpp) @@ -61,10 +69,18 @@ raja_add_executable( NAME tut_tiled-matrix-transpose SOURCES tut_tiled-matrix-transpose.cpp) +raja_add_executable( + NAME tut_halo-exchange + SOURCES tut_halo-exchange.cpp) + raja_add_executable( NAME pi-reduce_vs_atomic SOURCES pi-reduce_vs_atomic.cpp) +raja_add_executable( + NAME raja-teams + SOURCES raja-teams.cpp) + raja_add_executable( NAME jacobi SOURCES jacobi.cpp) @@ -76,11 +92,15 @@ raja_add_executable( raja_add_executable( NAME wave-eqn SOURCES wave-eqn.cpp) - + raja_add_executable( NAME ltimes SOURCES ltimes.cpp) +raja_add_executable( + NAME multiview + SOURCES multiview.cpp) + if(ENABLE_TARGET_OPENMP) raja_add_executable( NAME target-kernel @@ -91,4 +111,8 @@ if(ENABLE_TARGET_OPENMP) SOURCES omp-target-ltimes.cpp) endif() +raja_add_executable( + NAME kernel-dynamic-tile + SOURCES kernel-dynamic-tile.cpp) + add_subdirectory(plugin) diff --git a/examples/jacobi.cpp b/examples/jacobi.cpp index a2c43db22b..f391e5a8a1 100644 --- a/examples/jacobi.cpp +++ b/examples/jacobi.cpp @@ -317,8 +317,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using jacobiCUDANestedPolicy = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed<32>, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed<32>, RAJA::cuda_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, RAJA::statement::For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0> @@ -394,8 +394,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using jacobiHIPNestedPolicy = RAJA::KernelPolicy< RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed<32>, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed<32>, RAJA::hip_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_y_direct, RAJA::statement::For<0, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<0> diff --git a/examples/kernel-dynamic-tile.cpp b/examples/kernel-dynamic-tile.cpp new file mode 100644 index 0000000000..5de2123425 --- /dev/null +++ b/examples/kernel-dynamic-tile.cpp @@ -0,0 +1,34 @@ +#include "RAJA/RAJA.hpp" + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + std::cout << "\n\nRAJA dynamic_tile example...\n\n"; + +//Using policy = KernelPolicy, seq_exec, …>>; +//RAJA::kernel_param( +// make_tuple(RangeSegment(0,N)), +// make_tuple(32), // param 0 is referenced 
by tile_dynamic +// [=](int i, int tile_size){ +// +// }); + + using namespace RAJA; + + kernel_param< + KernelPolicy< + statement::Tile<1, tile_dynamic<1>, seq_exec, + statement::Tile<0, tile_dynamic<0>, seq_exec, + statement::For<1, seq_exec, + statement::For<0, seq_exec, statement::Lambda<0>> + > + > + > + > + >(make_tuple(RangeSegment{0,25}, RangeSegment{0,25}), + make_tuple(TileSize{5}, TileSize{10}), + //make_tuple(TileSize(10)), // not sure we need this, good for static_assert + [=](int i, int j, TileSize x, TileSize y){ + std::cout << "Running index (" << i << "," << j << ") of " << x.size << "x" << y.size << " tile." << std::endl; + }); + +} diff --git a/examples/ltimes.cpp b/examples/ltimes.cpp index 3cd769cb50..266859ac20 100644 --- a/examples/ltimes.cpp +++ b/examples/ltimes.cpp @@ -25,6 +25,7 @@ #include #endif + /* * LTimes Example * @@ -276,6 +277,73 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// +{ + std::cout << "\n Running RAJA sequential ARGS version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; + + std::array L_perm {{0, 1}}; + LView L(L_data, + RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + + std::array psi_perm {{0, 1, 2}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{0, 1, 2}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + using EXECPOL = + RAJA::KernelPolicy< + statement::For<0, loop_exec, // m + statement::For<1, loop_exec, // d + statement::For<2, loop_exec, // g + statement::For<3, simd_exec, // z + statement::Lambda<0, Segs<0, 1, 2, 3>> + > + > + > + > + >; + + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); + + RAJA::Timer timer; + timer.start(); + + RAJA::kernel( segments, + [=] (IM m, ID d, IG g, IZ z) { + phi(m, g, z) += L(m, d) * psi(d, g, z); + } + ); + + timer.stop(); + std::cout << " RAJA sequential ARGS version of LTimes run time (sec.): " + << timer.elapsed() << std::endl; + +#if defined(DEBUG_LTIMES) + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); +#endif +} + +//----------------------------------------------------------------------------// + { std::cout << "\n Running RAJA sequential shmem version of LTimes...\n"; @@ -283,14 +351,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension + // + // L(m, d) : 1 -> d is stride-1 dimension using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension + // psi(d, g, z) : 2 -> z is stride-1 dimension using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension + // phi(m, g, z) : 2 -> z is stride-1 dimension using PhiView = TypedView, IM, IG, IZ>; std::array L_perm {{0, 1}}; @@ -312,54 +380,63 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using RAJA::statement::Param; - using EXECPOL = + 
using EXECPOL = RAJA::KernelPolicy< // Create memory tiles statement::InitLocalMem, // Tile outer m,d loops - statement::Tile<0, statement::tile_fixed, loop_exec, // m - statement::Tile<1, statement::tile_fixed, loop_exec, // d + statement::Tile<0, tile_fixed, loop_exec, // m + statement::Tile<1, tile_fixed, loop_exec, // d // Load L(m,d) for m,d tile into shmem - statement::ForICount<0, Param<3>, loop_exec, // m - statement::ForICount<1, Param<4>, loop_exec, // d - statement::Lambda<1> + statement::For<0, loop_exec, // m + statement::For<1, loop_exec, // d + statement::Lambda<0, Segs<0, 1>, + Params<0>, + Offsets<0, 1>> > >, // Run inner g, z loops with z loop tiled statement::For<2, loop_exec, // g - statement::Tile<3, statement::tile_fixed, loop_exec, // z + statement::Tile<3, tile_fixed, loop_exec, // z // Load psi into shmem - statement::ForICount<1, Param<4>, loop_exec, // d - statement::ForICount<3, Param<6>, loop_exec, // z - statement::Lambda<2> + statement::For<1, loop_exec, // d + statement::For<3, loop_exec, // z + statement::Lambda<1, Segs<1, 2, 3>, + Params<1>, + Offsets<1, 2, 3>> > >, // Compute phi - statement::ForICount<0, Param<3>, loop_exec, // m + statement::For<0, loop_exec, // m // Load phi into shmem - statement::ForICount<3, Param<6>, loop_exec, // z - statement::Lambda<3> + statement::For<3, loop_exec, // z + statement::Lambda<2, Segs<0, 2, 3>, + Params<2>, + Offsets<0, 2, 3>> >, - // Compute phi in shmem - statement::ForICount<1, Param<4>, loop_exec, // d - statement::ForICount<3, Param<6>, loop_exec, // z - statement::Lambda<4> + // Compute phi in shmem + statement::For<1, loop_exec, // d + statement::For<3, loop_exec, // z + statement::Lambda<3, Params<0, 1, 2>, + Offsets<0, 1, 2, 3>> > >, // Store phi - statement:: ForICount<3, Param<6>, loop_exec, // z - statement::Lambda<5> + statement:: For<3, loop_exec, // z + statement::Lambda<4, Segs<0, 2, 3>, + Params<2>, + Offsets<0, 2, 3>> > > // m @@ -371,7 +448,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // Tile d > // Tile m > // LocalMemory - >; // KernelPolicy + >; // KernelPolicy @@ -379,31 +456,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Define statically dimensioned local arrays used in kernel // - using shmem_L_t = RAJA::TypedLocalArray, IM, ID>; shmem_L_t shmem_L; - using shmem_psi_t = RAJA::TypedLocalArray, ID, IG, IZ>; shmem_psi_t shmem_psi; - - - using shmem_phi_t = RAJA::TypedLocalArray, IM, IG, IZ>; shmem_phi_t shmem_phi; - + RAJA::Timer timer; timer.start(); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), RAJA::TypedRangeSegment(0, num_d), @@ -413,57 +490,44 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // They are the last args in all lambdas (after indices). 
RAJA::make_tuple( shmem_L, shmem_psi, - shmem_phi, - IM(0), - ID(0), - IG(0), - IZ(0) - ), - - // Lambda<0> : Single lambda version - [=] (IM m, ID d, IG g, IZ z, - shmem_L_t&, shmem_psi_t&, shmem_phi_t&, - IM , ID , IG , IZ ) - { - phi(m, g, z) += L(m, d) * psi(d, g, z); - }, + shmem_phi), - // Lambda<1> : Load L into shmem - [=] (IM m, ID d, IG /*g*/, IZ /*z*/, - shmem_L_t& sh_L, shmem_psi_t&, shmem_phi_t&, - IM tm, ID td, IG , IZ ) + + // Lambda<0> : Load L into shmem + [=] (IM m, ID d, + shmem_L_t& sh_L, + IM tm, ID td) { sh_L(tm, td) = L(m, d); }, - // Lambda<2> : Load psi into shmem - [=] (IM /*m*/, ID d, IG g, IZ z, - shmem_L_t&, shmem_psi_t& sh_psi, shmem_phi_t&, - IM , ID td, IG tg, IZ tz) + // Lambda<1> : Load psi into shmem + [=] (ID d, IG g, IZ z, + shmem_psi_t& sh_psi, + ID td, IG tg, IZ tz) { sh_psi(td, tg, tz) = psi(d, g, z); }, - // Lambda<3> : Load phi into shmem - [=] (IM m, ID /*d*/, IG g, IZ z, - shmem_L_t&, shmem_psi_t&, shmem_phi_t& sh_phi, - IM tm, ID , IG tg, IZ tz) + // Lambda<2> : Load phi into shmem + [=] (IM m, IG g, IZ z, + shmem_phi_t& sh_phi, + IM tm, IG tg, IZ tz) { sh_phi(tm, tg, tz) = phi(m, g, z); }, - // Lambda<4> : Compute phi in shmem - [=] (IM , ID , IG , IZ , - shmem_L_t& sh_L, shmem_psi_t& sh_psi, shmem_phi_t& sh_phi, - IM tm, ID td, IG tg, IZ tz) + // Lambda<3> : Compute phi in shmem + [=] (shmem_L_t& sh_L, shmem_psi_t& sh_psi, shmem_phi_t& sh_phi, + IM tm, ID td, IG tg, IZ tz) { sh_phi(tm, tg, tz) += sh_L(tm, td) * sh_psi(td, tg, tz); }, - // Lambda<5> : Store phi - [=] (IM m, ID /*d*/, IG g, IZ z, - shmem_L_t&, shmem_psi_t&, shmem_phi_t& sh_phi, - IM tm, ID , IG tg, IZ tz) + // Lambda<4> : Store phi + [=] (IM m, IG g, IZ z, + shmem_phi_t& sh_phi, + IM tm, IG tg, IZ tz) { phi(m, g, z) = sh_phi(tm, tg, tz); } @@ -478,6 +542,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif } + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -742,50 +807,52 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Define our execution policy // - using RAJA::statement::Param; + using RAJA::Segs; + using RAJA::Params; + using RAJA::Offsets; using EXECPOL = RAJA::KernelPolicy< statement::CudaKernelAsync< statement::InitLocalMem, // Tile outer m,d loops - statement::Tile<0, statement::tile_fixed, seq_exec, // m - statement::Tile<1, statement::tile_fixed, seq_exec, // d + statement::Tile<0, tile_fixed, seq_exec, // m + statement::Tile<1, tile_fixed, seq_exec, // d // Load L for m,d tile into shmem - statement::ForICount<1, Param<4>, cuda_thread_x_loop, // d - statement::ForICount<0, Param<3>, cuda_thread_y_direct, // m - statement::Lambda<0> + statement::For<1, cuda_thread_x_loop, // d + statement::For<0, cuda_thread_y_direct, // m + statement::Lambda<0, Segs<0,1>, Params<0>, Offsets<0,1>> > >, statement::CudaSyncThreads, // Distribute g, z across blocks and tile z statement::For<2, cuda_block_y_loop, // g - statement::Tile<3, statement::tile_fixed, cuda_block_x_loop, // z + statement::Tile<3, tile_fixed, cuda_block_x_loop, // z // Load phi into thread local storage - statement::ForICount<3, Param<6>, cuda_thread_x_direct, // z - statement::ForICount<0, Param<3>, cuda_thread_y_direct, // m - statement::Lambda<2> + statement::For<3, cuda_thread_x_direct, // z + statement::For<0, cuda_thread_y_direct, // m + statement::Lambda<2, Segs<0,2,3>, Params<2>> > >, // Load slice of psi into shmem - 
statement::ForICount<3, Param<6>, cuda_thread_x_direct, // z - statement::ForICount<1, Param<4>, cuda_thread_y_loop, // d (reusing y) - statement::Lambda<1> + statement::For<3,cuda_thread_x_direct, // z + statement::For<1, cuda_thread_y_loop, // d (reusing y) + statement::Lambda<1, Segs<1,2,3>, Params<1>, Offsets<1,2,3>> > >, statement::CudaSyncThreads, // Compute phi - statement::ForICount<3, Param<6>, cuda_thread_x_direct, // z - statement::ForICount<0, Param<3>, cuda_thread_y_direct, // m + statement::For<3, cuda_thread_x_direct, // z + statement::For<0, cuda_thread_y_direct, // m // Compute thread-local Phi value and store - statement::ForICount<1, Param<4>, seq_exec, // d - statement::Lambda<3> + statement::For<1, seq_exec, // d + statement::Lambda<3, Segs<0,1,2,3>, Params<0,1,2>, Offsets<0,1,2,3>> > // d > // m >, // z @@ -794,9 +861,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) statement::CudaSyncThreads, // Write out phi from thread local storage - statement::ForICount<3, Param<6>, cuda_thread_x_direct, // z - statement::ForICount<0, Param<3>, cuda_thread_y_direct, // m - statement::Lambda<4> + statement::For<3, cuda_thread_x_direct, // z + statement::For<0, cuda_thread_y_direct, // m + statement::Lambda<4, Segs<0,2,3>, Params<2>> > >, statement::CudaSyncThreads @@ -820,7 +887,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.start(); RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::make_tuple( + RAJA::TypedRangeSegment(0, num_m), RAJA::TypedRangeSegment(0, num_d), RAJA::TypedRangeSegment(0, num_g), RAJA::TypedRangeSegment(0, num_z)), @@ -831,31 +899,26 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // computing a phi value, for shared memory before writing to phi array. 
RAJA::make_tuple( shmem_L, shmem_psi, - 0.0, - IM(0), - ID(0), - IG(0), - IZ(0)), + 0.0), // Lambda<0> : Load L into shmem - [=] RAJA_DEVICE (IM m, ID d, IG g, IZ z, - shmem_L_t& sh_L, shmem_psi_t&, double&, - IM tm, ID td, IG, IZ) { + [=] RAJA_DEVICE (IM m, ID d, + shmem_L_t& sh_L, + IM tm, ID td) { sh_L(tm, td) = L(m, d); }, // Lambda<1> : Load slice of psi into shmem - [=] RAJA_DEVICE (IM /*m*/, ID d, IG g, IZ z, - shmem_L_t&, shmem_psi_t& sh_psi, double&, - IM, ID td, IG tg, IZ tz) { + [=] RAJA_DEVICE (ID d, IG g, IZ z, + shmem_psi_t& sh_psi, + ID td, IG tg, IZ tz) { sh_psi(td, tg, tz) = psi(d, g, z); }, // Lambda<2> : Load thread-local phi value - [=] RAJA_DEVICE (IM m, ID /*d*/, IG g, IZ z, - shmem_L_t&, shmem_psi_t&, double& phi_local, - IM, ID, IG, IZ) { + [=] RAJA_DEVICE (IM m, IG g, IZ z, + double& phi_local) { phi_local = phi(m, g, z); }, @@ -869,9 +932,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // Lambda<4> : Store phi - [=] RAJA_DEVICE (IM m, ID /*d*/, IG g, IZ z, - shmem_L_t&, shmem_psi_t&, double& phi_local, - IM, ID, IG, IZ) { + [=] RAJA_DEVICE (IM m, IG g, IZ z, + double& phi_local) { phi(m, g, z) = phi_local; } @@ -1083,64 +1145,67 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using RAJA::statement::Param; + using RAJA::Segs; + using RAJA::Params; + using RAJA::Offsets; using EXECPOL = RAJA::KernelPolicy< statement::HipKernelAsync< statement::InitLocalMem, - // Tile outer m,d loops - statement::Tile<0, statement::tile_fixed, seq_exec, // m - statement::Tile<1, statement::tile_fixed, seq_exec, // d - - // Load L for m,d tile into shmem - statement::ForICount<1, Param<4>, hip_thread_x_loop, // d - statement::ForICount<0, Param<3>, hip_thread_y_direct, // m - statement::Lambda<0> + // Tile outer m,d loops + statement::Tile<0, tile_fixed, seq_exec, // m + statement::Tile<1, tile_fixed, seq_exec, // d + + // Load L for m,d tile into shmem + statement::For<1, hip_thread_x_loop, // d + statement::For<0, hip_thread_y_direct, // m + statement::Lambda<0, Segs<0,1>, Params<0>, Offsets<0,1>> > >, statement::HipSyncThreads, // Distribute g, z across blocks and tile z statement::For<2, hip_block_y_loop, // g - statement::Tile<3, statement::tile_fixed, hip_block_x_loop, // z + statement::Tile<3, tile_fixed, hip_block_x_loop, // z // Load phi into thread local storage - statement::ForICount<3, Param<6>, hip_thread_x_direct, // z - statement::ForICount<0, Param<3>, hip_thread_y_direct, // m - statement::Lambda<2> + statement::For<3, hip_thread_x_direct, // z + statement::For<0, hip_thread_y_direct, // m + statement::Lambda<2, Segs<0,2,3>, Params<2>> > >, // Load slice of psi into shmem - statement::ForICount<3, Param<6>, hip_thread_x_direct, // z - statement::ForICount<1, Param<4>, hip_thread_y_loop, // d (reusing y) - statement::Lambda<1> + statement::For<3, hip_thread_x_direct, // z + statement::For<1, hip_thread_y_loop, // d (reusing y) + statement::Lambda<1, Segs<1,2,3>, Params<1>, Offsets<1,2,3>> > >, statement::HipSyncThreads, // Compute phi - statement::ForICount<3, Param<6>, hip_thread_x_direct, // z - statement::ForICount<0, Param<3>, hip_thread_y_direct, // m + statement::For<3, hip_thread_x_direct, // z + statement::For<0, hip_thread_y_direct, // m // Compute thread-local Phi value and store - statement::ForICount<1, Param<4>, seq_exec, // d - statement::Lambda<3> + statement::For<1, seq_exec, // d + statement::Lambda<3, Segs<0,1,2,3>, Params<0,1,2>, Offsets<0,1,2,3>> > // d > // m >, // z - + // finish tile over 
directions statement::HipSyncThreads, // Write out phi from thread local storage - statement::ForICount<3, Param<6>, hip_thread_x_direct, // z - statement::ForICount<0, Param<3>, hip_thread_y_direct, // m - statement::Lambda<4> + statement::For<3, hip_thread_x_direct, // z + statement::For<0, hip_thread_y_direct, // m + statement::Lambda<4, Segs<0,2,3>, Params<2>> > >, statement::HipSyncThreads - + > // Tile z > // g @@ -1154,13 +1219,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) - RAJA::Timer timer; hipErrchk( hipDeviceSynchronize() ); timer.start(); RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::make_tuple( + RAJA::TypedRangeSegment(0, num_m), RAJA::TypedRangeSegment(0, num_d), RAJA::TypedRangeSegment(0, num_g), RAJA::TypedRangeSegment(0, num_z)), @@ -1171,31 +1236,26 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // computing a phi value, for shared memory before writing to phi array. RAJA::make_tuple( shmem_L, shmem_psi, - 0.0, - IM(0), - ID(0), - IG(0), - IZ(0)), + 0.0), // Lambda<0> : Load L into shmem - [=] RAJA_DEVICE (IM m, ID d, IG g, IZ z, - shmem_L_t& sh_L, shmem_psi_t&, double&, - IM tm, ID td, IG, IZ) { + [=] RAJA_DEVICE (IM m, ID d, + shmem_L_t& sh_L, + IM tm, ID td) { sh_L(tm, td) = L(m, d); }, // Lambda<1> : Load slice of psi into shmem - [=] RAJA_DEVICE (IM /*m*/, ID d, IG g, IZ z, - shmem_L_t&, shmem_psi_t& sh_psi, double&, - IM, ID td, IG tg, IZ tz) { + [=] RAJA_DEVICE (ID d, IG g, IZ z, + shmem_psi_t& sh_psi, + ID td, IG tg, IZ tz) { sh_psi(td, tg, tz) = psi(d, g, z); }, // Lambda<2> : Load thread-local phi value - [=] RAJA_DEVICE (IM m, ID /*d*/, IG g, IZ z, - shmem_L_t&, shmem_psi_t&, double& phi_local, - IM, ID, IG, IZ) { + [=] RAJA_DEVICE (IM m, IG g, IZ z, + double& phi_local) { phi_local = phi(m, g, z); }, @@ -1209,9 +1269,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // Lambda<4> : Store phi - [=] RAJA_DEVICE (IM m, ID /*d*/, IG g, IZ z, - shmem_L_t&, shmem_psi_t&, double& phi_local, - IM, ID, IG, IZ) { + [=] RAJA_DEVICE (IM m, IG g, IZ z, + double& phi_local) { phi(m, g, z) = phi_local; } diff --git a/examples/multiview.cpp b/examples/multiview.cpp new file mode 100644 index 0000000000..65975fd144 --- /dev/null +++ b/examples/multiview.cpp @@ -0,0 +1,200 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "RAJA/RAJA.hpp" +#include +#include + +/* + * MultiView Usage Example + * + * A RAJA::MultiView object wraps an array-of-pointers, + * or a pointer-to-pointers, whereas a RAJA::View wraps a single + * pointer or array. This allows a single RAJA::Layout to be applied to + * multiple arrays internal to the MultiView, allowing multiple arrays to share indexing + * arithmetic when their access patterns are the same. + * + * The instantiation of a MultiView works exactly like a standard View, + * except that it takes an array-of-pointers. In the following example, a MultiView + * applies a 1-D layout of length 4 to 2 internal arrays in myarr: + * + * // Arrays of the same size, which will become internal to the MultiView. 
+ * int a1[4] = {5,6,7,8}; + * int a2[4] = {9,10,11,12}; + * + * // Array-of-pointers which will be passed into MultiView. + * int * myarr[2]; + * myarr[0] = a1; + * myarr[1] = a2; + * + * // This MultiView applies a 1-D layout of length 4 to each internal array in myarr. + * RAJA::MultiView< int, RAJA::Layout<1> > MView(myarr, 4); + * + * The default MultiView accesses internal arrays via the 0th index of the MultiView: + * + * MView( 0, 4 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 + * MView( 1, 2 ); // accesses 2nd index of the 1st internal array a2, returns value of 10 + * + * The index into the array-of-pointers can be moved to different + * indices of the MultiView () access operator, rather than the default 0th index. By + * passing a third template parameter to the MultiView constructor, the internal array index + * and the integer indicating which array to access can be reversed: + * + * // MultiView with array-of-pointers index in 1st position + * RAJA::MultiView< int, RAJA::Layout<1>, 1 > MView1(myarr, 4); + * + * MView1( 4, 0 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 + * MView1( 2, 1 ); // accesses 2nd index of the 1st internal array a2, returns value of 10 + * + * As the number of Layout dimensions increases, the index into the array-of-pointers can be + * moved to more distinct locations in the MultiView () access operator. Here is an example + * which compares the accesses of a 2-D layout on a normal RAJA::View with a RAJA::MultiView + * with the array-of-pointers index set to the 2nd position: + * + * RAJA::View< int, RAJA::Layout<2> > normalView(a1, 2, 2); + * + * normalView( 2, 1 ); // accesses 3rd index of the a1 array, value = 7 + * + * // MultiView with array-of-pointers index in 2nd position + * RAJA::MultiView< int, RAJA::Layout<2>, 2 > MView2(myarr, 2, 2); + * + * MView2( 2, 1, 0 ); // accesses the 3rd index of the 0th internal array a1, returns value of 7 (same as normaView(2,1)) + * MView2( 2, 1, 1 ); // accesses the 3rd index of the 1st internal array a2, returns value of 11 + * + * The following code demonstrates 2 aspects of RAJA::MultiView usage: + * - Basic usage + * - Moving of the array-of-pointers index + */ + +void docs_example() +{ + // temporaries + int t1, t2, t3, t4; + + printf( "MultiView Example from RAJA Documentation:\n" ); + + // _multiview_example_1Dinit_start + // Arrays of the same size, which will become internal to the MultiView. + int a1[4] = {5,6,7,8}; + int a2[4] = {9,10,11,12}; + + // Array-of-pointers which will be passed into MultiView. + int * myarr[2]; + myarr[0] = a1; + myarr[1] = a2; + + // This MultiView applies a 1-D layout of length 4 to each internal array in myarr. + RAJA::MultiView< int, RAJA::Layout<1> > MView(myarr, 4); + // _multiview_example_1Dinit_end + + // _multiview_example_1Daccess_start + t1 = MView( 0, 3 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 + t2 = MView( 1, 2 ); // accesses 3rd index of the 1st internal array a2, returns value of 11 + // _multiview_example_1Daccess_end + + // _multiview_example_1Daopindex_start + // MultiView with array-of-pointers index in 1st position. 
+ RAJA::MultiView< int, RAJA::Layout<1>, 1 > MView1(myarr, 4); + + t3 = MView1( 3, 0 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 + t4 = MView1( 2, 1 ); // accesses 3rd index of the 1st internal array a2, returns value of 11 + // _multiview_example_1Daopindex_end + + printf( "Comparison of default MultiView with another MultiView that has the array-of-pointers index in the 1st position of the () accessor:\n" ); + printf( "MView( 0, 3 ) = %i, MView1( 3, 0 ) = %i\n", t1, t3 ); + printf( "MView( 1, 2 ) = %i, MView1( 2, 1 ) = %i\n", t2, t4 ); + + // _multiview_example_2Daopindex_start + RAJA::View< int, RAJA::Layout<2> > normalView(a1, 2, 2); + + t1 = normalView( 1, 1 ); // accesses 4th index of the a1 array, value = 8 + + // MultiView with array-of-pointers index in 2nd position + RAJA::MultiView< int, RAJA::Layout<2>, 2 > MView2(myarr, 2, 2); + + t2 = MView2( 1, 1, 0 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 (same as normalView(1,1)) + t3 = MView2( 0, 0, 1 ); // accesses the 1st index of the 1st internal array a2, returns value of 9 + // _multiview_example_2Daopindex_end + + printf( "Comparison of 2D normal View with 2D MultiView that has the array-of-pointers index in the 2nd position of the () accessor:\n" ); + printf( "normalView( 1, 1 ) = %i, MView2( 1, 1, 0 ) = %i\n", t1, t2 ); +} + +int main() +{ + docs_example(); + + constexpr int N = 12; + int * myarr[2]; // two 3x4 arrays + int arr1[N]; + int arr2[N]; + + for ( int ii = 0; ii < N; ++ii ) + { + arr1[ii] = 100 + ii; + arr2[ii] = 200 + ii; + } + + myarr[0] = arr1; + myarr[1] = arr2; + + // 4x3 layout + std::array perm { {0, 1} }; + RAJA::Layout<2> layout = RAJA::make_permuted_layout( + { {4, 3} }, perm + ); + + // Basic MultiView usage + // Default usage: no specified array-of-pointers index moving + // 0th position is used as the array-of-pointers index + RAJA::MultiView> arrView(myarr, layout); + + // Moved array-of-pointers index MultiView usage + // Add an array-of-pointers index specifier + constexpr int aopidx = 1; + RAJA::MultiView, aopidx> arrViewMov(myarr, layout); + + // Comparing values of both views + printf ( "Comparing values of both default and 1-index-ed MultiViews:\n" ); + for ( int pp = 0; pp < 2; ++pp ) + { + for ( int kk = 0; kk < 4; ++kk ) + { + for ( int jj = 0; jj < 3; ++jj ) + { + printf ( "arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", pp, kk, jj, arrView(pp, kk, jj), kk, pp, jj, arrViewMov(kk, pp, jj) ); + } + } + } + + // switch values + printf ( "Switching values\n" ); + for ( int kk = 0; kk < 4; ++kk ) + { + for ( int jj = 0; jj < 3; ++jj ) + { + int temp = arrView(0, kk, jj); + arrView(0, kk, jj) = arrView(1, kk, jj); + arrView(1, kk, jj) = temp; + } + } + + // Comparing switched values of both views + printf ( "Comparing switched values of both default and 1-index-ed MultiViews:\n" ); + for ( int pp = 0; pp < 2; ++pp ) + { + for ( int kk = 0; kk < 4; ++kk ) + { + for ( int jj = 0; jj < 3; ++jj ) + { + printf ( "arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", pp, kk, jj, arrView(pp, kk, jj), kk, pp, jj, arrViewMov(kk, pp, jj) ); + } + } + } + + return 0; +} diff --git a/examples/plugin/CMakeLists.txt b/examples/plugin/CMakeLists.txt index bbe173e26f..bb67edc4e6 100644 --- a/examples/plugin/CMakeLists.txt +++ b/examples/plugin/CMakeLists.txt @@ -8,3 +8,11 @@ raja_add_executable( NAME plugin-example SOURCES test-plugin.cpp counter-plugin.cpp) + +raja_add_executable( + NAME plugin-example-dynamic + SOURCES test-plugin-dynamic.cpp) + 
+raja_add_plugin_library(NAME timer_plugin + SHARED TRUE + SOURCES timer-plugin.cpp) diff --git a/examples/plugin/counter-plugin.cpp b/examples/plugin/counter-plugin.cpp index 146bc86d23..87b0bc13a2 100644 --- a/examples/plugin/counter-plugin.cpp +++ b/examples/plugin/counter-plugin.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// _plugin_example_start #include "RAJA/util/PluginStrategy.hpp" #include @@ -13,20 +14,37 @@ class CounterPlugin : public RAJA::util::PluginStrategy { public: - void preLaunch(RAJA::util::PluginContext p) { + void preCapture(const RAJA::util::PluginContext& p) override { if (p.platform == RAJA::Platform::host) - std::cout << " [CounterPlugin]: Launching host kernel for the " << ++host_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Capturing host kernel for the " << ++host_capture_counter << " time!" << std::endl; else - std::cout << " [CounterPlugin]: Launching device kernel for the " << ++device_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Capturing device kernel for the " << ++device_capture_counter << " time!" << std::endl; } - void postLaunch(RAJA::util::PluginContext RAJA_UNUSED_ARG(p)) { + void preLaunch(const RAJA::util::PluginContext& p) override { + if (p.platform == RAJA::Platform::host) + { + std::cout << " [CounterPlugin]: Launching host kernel for the " << ++host_launch_counter << " time!" << std::endl; + } + else + { + std::cout << " [CounterPlugin]: Launching device kernel for the " << ++device_launch_counter << " time!" << std::endl; + } } private: - int host_counter; - int device_counter; + int host_capture_counter; + int device_capture_counter; + int host_launch_counter; + int device_launch_counter; }; -// Regiser plugin with the PluginRegistry -static RAJA::util::PluginRegistry::Add P("counter-plugin", "Counter"); +// Statically loading plugin. +static RAJA::util::PluginRegistry::add P("Counter", "Counts number of kernel launches."); + +// Dynamically loading plugin. +extern "C" RAJA::util::PluginStrategy *getPlugin () +{ + return new CounterPlugin; +} +// _plugin_example_end diff --git a/examples/plugin/test-plugin-dynamic.cpp b/examples/plugin/test-plugin-dynamic.cpp new file mode 100644 index 0000000000..4e2cb202f4 --- /dev/null +++ b/examples/plugin/test-plugin-dynamic.cpp @@ -0,0 +1,22 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "RAJA/RAJA.hpp" +#include + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + RAJA::util::init_plugins("../lib/libtimer_plugin.so"); + + double *a = new double[10]; + for (int i = 0; i < 4; i++) + { + RAJA::forall(RAJA::RangeSegment(0, 10), [=](int i) { + a[i] = 0; + }); + } +} diff --git a/examples/plugin/timer-plugin.cpp b/examples/plugin/timer-plugin.cpp new file mode 100644 index 0000000000..248a514df8 --- /dev/null +++ b/examples/plugin/timer-plugin.cpp @@ -0,0 +1,48 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "RAJA/util/PluginStrategy.hpp" + +#include +#include + +class TimerPlugin : public RAJA::util::PluginStrategy +{ +public: + void preLaunch(const RAJA::util::PluginContext& RAJA_UNUSED_ARG(p)) override + { + start_time = std::chrono::steady_clock::now(); + } + + void postLaunch(const RAJA::util::PluginContext& p) override + { + end_time = std::chrono::steady_clock::now(); + double elapsedMs = std::chrono::duration(end_time - start_time).count(); + + if (p.platform == RAJA::Platform::host) + { + printf("[TimerPlugin]: Elapsed time of host kernel was %f ms\n", elapsedMs); + } + else + { + printf("[TimerPlugin]: Elapsed time of device kernel was %f ms\n", elapsedMs); + } + } + +private: + std::chrono::steady_clock::time_point start_time; + std::chrono::steady_clock::time_point end_time; +}; + +// Dynamically loading plugin. +extern "C" RAJA::util::PluginStrategy *getPlugin() +{ + return new TimerPlugin; +} + +// Statically loading plugin. +static RAJA::util::PluginRegistry::add P("Timer", "Prints elapsed time of kernel executions."); \ No newline at end of file diff --git a/examples/raja-teams.cpp b/examples/raja-teams.cpp new file mode 100644 index 0000000000..870b176bed --- /dev/null +++ b/examples/raja-teams.cpp @@ -0,0 +1,191 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" + + +/* + * RAJA Teams Example: Upper Triangular Pattern + Shared Memory + * + * Teams introduces hierarchal parallelism through the concept of + * teams and threads. Computation is executed in a pre-defined grid + * composed of threads and grouped into teams. The teams model enables + * developers to express parallelism through loops over teams, and inner loops + * over threads. Team loops are executed in parallel and + * threads within a team should be treated as sub-parallel regions. + * + * Team shared memory is allocated between team and thread loops. + * Memory allocated within thread loops are thread private. + * The example below demonstrates composing an upper triangular + * loop pattern, and using shared memory. + * + */ + +/* + * Define host/device launch policies + */ +using launch_policy = RAJA::expt::LaunchPolicy< +#if defined(RAJA_ENABLE_OPENMP) + RAJA::expt::omp_launch_t +#else + RAJA::expt::seq_launch_t +#endif +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::expt::cuda_launch_t +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::expt::hip_launch_t +#endif + >; + +/* + * Define team policies. + * Up to 3 dimension are supported: x,y,z + */ +using teams_x = RAJA::expt::LoopPolicy< +#if defined(RAJA_ENABLE_OPENMP) + RAJA::omp_parallel_for_exec +#else + RAJA::loop_exec +#endif +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_block_x_direct +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_block_x_direct +#endif + >; +/* + * Define thread policies. 
+ * Up to 3 dimension are supported: x,y,z + */ +using threads_x = RAJA::expt::LoopPolicy; + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + // Resource object for host + camp::resources::Host host_res; + + // Resource objects for CUDA or HIP +#if defined(RAJA_ENABLE_CUDA) + camp::resources::Cuda device_res; +#endif + +#if defined(RAJA_ENABLE_HIP) + camp::resources::Hip device_res; +#endif + + std::cout << "\n Running RAJA-Teams examples...\n"; + int num_of_backends = 1; +#if defined(RAJA_ENABLE_DEVICE) + num_of_backends++; +#endif + + // RAJA teams may switch between host and device policies at run time. + // The loop below will execute through the available backends. + + for (int exec_place = 0; exec_place < num_of_backends; ++exec_place) { + + RAJA::expt::ExecPlace select_cpu_or_gpu = (RAJA::expt::ExecPlace)exec_place; + + // auto select_cpu_or_gpu = RAJA::HOST; + // auto select_cpu_or_gpu = RAJA::DEVICE; + + // Allocate memory for either host or device + int N_tri = 5; + + int *Ddat; + if (select_cpu_or_gpu == RAJA::expt::HOST) + Ddat = host_res.allocate(N_tri * N_tri); + +#if defined(RAJA_ENABLE_DEVICE) + if (select_cpu_or_gpu == RAJA::expt::DEVICE) + Ddat = device_res.allocate(N_tri * N_tri); +#endif + + /* + * RAJA::expt::launch just starts a "kernel" and doesn't provide any looping. + * + * The first argument determines which policy should be executed, + * + * The second argument is the number of teams+threads needed for each of the + * policies. + * + * Third argument is the lambda. + * + * The lambda takes a "resource" object, which has the teams+threads + * and is used to perform thread synchronizations within a team. + */ + + if (select_cpu_or_gpu == RAJA::expt::HOST){ + std::cout << "\n Running Upper triangular pattern example on the host...\n"; + }else { + std::cout << "\n Running Upper triangular pattern example on the device...\n"; + } + + + RAJA::View> D(Ddat, N_tri, N_tri); + + RAJA::expt::launch(select_cpu_or_gpu, + RAJA::expt::Resources(RAJA::expt::Teams(N_tri), RAJA::expt::Threads(N_tri)), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { + + // Array shared within threads of the same team + TEAM_SHARED int s_A[1]; + + RAJA::expt::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { + if (c == r) s_A[0] = r; + D(r, c) = r * N_tri + c; + }); // loop j + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { + + printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]); + + }); // loop c + }); // loop r + }); // outer lambda + + if (select_cpu_or_gpu == RAJA::expt::HOST) { + host_res.deallocate(Ddat); + } + +#if defined(RAJA_ENABLE_DEVICE) + if (select_cpu_or_gpu == RAJA::expt::DEVICE) { + device_res.deallocate(Ddat); + } +#endif + + } // Execution places loop + + +} // Main diff --git a/examples/red-black-gauss-seidel.cpp b/examples/red-black-gauss-seidel.cpp index 9eb6401a35..6d24cee1fe 100644 --- a/examples/red-black-gauss-seidel.cpp +++ b/examples/red-black-gauss-seidel.cpp @@ -14,7 +14,7 @@ #include "RAJA/RAJA.hpp" -#include "memoryManager.hpp" +#include "camp/resource.hpp" /* * Gauss-Seidel with Red-Black Ordering Example @@ -65,7 +65,8 @@ struct grid_s { */ double solution(double x, double y); void computeErr(double *I, grid_s grid); -RAJA::TypedIndexSet gsColorPolicy(int N); +RAJA::TypedIndexSet + gsColorPolicy(int N, camp::resources::Resource& res); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ 
-97,11 +98,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) gridx.h = 1.0 / (N + 1.0); gridx.n = N + 2; - double *I = memoryManager::allocate(NN); + camp::resources::Resource resource{camp::resources::Host()}; + + double *I = resource.allocate(NN); memset(I, 0, NN * sizeof(double)); - RAJA::TypedIndexSet colorSet = gsColorPolicy(N); + RAJA::TypedIndexSet colorSet = gsColorPolicy(N, resource); memset(I, 0, NN * sizeof(double)); @@ -160,8 +163,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) computeErr(I, gridx); printf("No of iterations: %d \n \n", iteration); - - memoryManager::deallocate(I); + resource.deallocate(I); return 0; } @@ -172,9 +174,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // to generate RAJA ListSegments and populate a RAJA Static Index // Set. -RAJA::TypedIndexSet gsColorPolicy(int N) +RAJA::TypedIndexSet + gsColorPolicy(int N, camp::resources::Resource& res) { - RAJA::TypedIndexSet colorSet; int redN = ceil(N * N / 2); @@ -205,8 +207,8 @@ RAJA::TypedIndexSet gsColorPolicy(int N) } // Create Index - colorSet.push_back(RAJA::ListSegment(Blk, blkN)); - colorSet.push_back(RAJA::ListSegment(Red, redN)); + colorSet.push_back(RAJA::ListSegment(Blk, blkN, res)); + colorSet.push_back(RAJA::ListSegment(Red, redN, res)); delete[] Blk; delete[] Red; diff --git a/examples/resource-forall.cpp b/examples/resource-forall.cpp new file mode 100644 index 0000000000..a83ac7264c --- /dev/null +++ b/examples/resource-forall.cpp @@ -0,0 +1,376 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" +#include "RAJA/util/resource.hpp" + +/* + * Vector Addition Example + * + * Computes c = a + b, where a, b, c are vectors of ints. + * It illustrates similarities between a C-style for-loop and a RAJA + * forall loop. + * + * RAJA features shown: + * - `forall` loop iteration template method + * - Index range segment + * - Execution policies + * - `forall` with Resource argument + * - Cuda/Hip streams w/ Resource + * - Resources events + * + */ + + +// +// Functions for checking and printing results +// +void checkResult(int* res, int len); +void printResult(int* res, int len); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA vector addition example...\n"; + +// +// Define vector length +// + const int N = 100000; + +// +// Allocate and initialize vector data +// + RAJA::resources::Host host{}; + + int *a = host.allocate(N); + int *b = host.allocate(N); + int *c = host.allocate(N); + + int *a_ = host.allocate(N); + int *b_ = host.allocate(N); + int *c_ = host.allocate(N); + + + for (int i = 0; i < N; ++i) { + a[i] = -i; + b[i] = 2 * i; + a_[i] = -i; + b_[i] = 2 * i; + + } + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style vector addition...\n"; + + for (int i = 0; i < N; ++i) { + c[i] = a[i] + b[i]; + } + + checkResult(c, N); + + +//----------------------------------------------------------------------------// +// RAJA::seq_exec policy enforces strictly sequential execution.... 
+//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential vector addition...\n"; + + + RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); + +//----------------------------------------------------------------------------// +// RAJA::loop_exec policy enforces loop execution.... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA loop vector addition...\n"; + + RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); + +//----------------------------------------------------------------------------// +// RAJA::sind_exec policy enforces simd execution.... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA simd_exec vector addition...\n"; + + RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); + +#if defined(RAJA_ENABLE_OPENMP) +//----------------------------------------------------------------------------// +// RAJA::omp_for_parallel_exec policy execution.... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA omp_parallel vector addition...\n"; + + RAJA::forall>(host, RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); + +//----------------------------------------------------------------------------// +// RAJA::omp_for_nowait_exec policy execution.... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA omp_for_nowait vector addition...\n"; + + RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); + +//----------------------------------------------------------------------------// +// RAJA::omp_for_exec policy execution.... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA omp_for_exec vector addition...\n"; + + RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); +#endif + +#if defined(RAJA_ENABLE_TBB) +//----------------------------------------------------------------------------// +// RAJA::tbb_for_dynamic policy execution.... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA tbb_for_dynamic vector addition...\n"; + + RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); + +//----------------------------------------------------------------------------// +// RAJA::tbb_for_static policy execution.... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA tbb_for_static<8> vector addition...\n"; + + RAJA::forall>(host, RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); +#endif + + + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + +/* + GPU_BLOCK_SIZE - specifies the number of threads in a CUDA/HIP thread block +*/ +const int GPU_BLOCK_SIZE = 256; + +//----------------------------------------------------------------------------// +// RAJA::cuda/hip_exec policy execution.... 
+//----------------------------------------------------------------------------// +{ + std::cout << "\n Running RAJA GPU vector addition on 2 seperate streams...\n"; +#if defined(RAJA_ENABLE_CUDA) + RAJA::resources::Cuda res_gpu1; + RAJA::resources::Cuda res_gpu2; + using EXEC_POLICY = RAJA::cuda_exec_async; +#elif defined(RAJA_ENABLE_HIP) + RAJA::resources::Hip res_gpu1; + RAJA::resources::Hip res_gpu2; + using EXEC_POLICY = RAJA::hip_exec_async; +#endif + + int* d_a1 = res_gpu1.allocate(N); + int* d_b1 = res_gpu1.allocate(N); + int* d_c1 = res_gpu1.allocate(N); + + int* d_a2 = res_gpu2.allocate(N); + int* d_b2 = res_gpu2.allocate(N); + int* d_c2 = res_gpu2.allocate(N); + + res_gpu1.memcpy(d_a1, a, sizeof(int)* N); + res_gpu1.memcpy(d_b1, b, sizeof(int)* N); + + res_gpu2.memcpy(d_a2, a, sizeof(int)* N); + res_gpu2.memcpy(d_b2, b, sizeof(int)* N); + + + RAJA::forall(res_gpu1, RAJA::RangeSegment(0, N), + [=] RAJA_DEVICE (int i) { + d_c1[i] = d_a1[i] + d_b1[i]; + }); + + RAJA::forall(res_gpu2, RAJA::RangeSegment(0, N), + [=] RAJA_DEVICE (int i) { + d_c2[i] = d_a2[i] + d_b2[i]; + }); + + res_gpu1.memcpy(c, d_c1, sizeof(int)*N ); + + res_gpu2.memcpy(c_, d_c2, sizeof(int)*N ); + + checkResult(c, N); + checkResult(c_, N); + + res_gpu1.deallocate(d_a1); + res_gpu1.deallocate(d_b1); + res_gpu1.deallocate(d_c1); + + res_gpu2.deallocate(d_a2); + res_gpu2.deallocate(d_b2); + res_gpu2.deallocate(d_c2); +} + + +//----------------------------------------------------------------------------// +// RAJA::cuda/hip_exec policy with waiting event.... +//----------------------------------------------------------------------------// +{ + std::cout << "\n Running RAJA GPU vector with dependency between two seperate streams...\n"; +#if defined(RAJA_ENABLE_CUDA) + // _raja_res_defres_start + RAJA::resources::Cuda res_gpu1; + RAJA::resources::Cuda res_gpu2; + RAJA::resources::Host res_host; + + using EXEC_POLICY = RAJA::cuda_exec_async; + // _raja_res_defres_end +#elif defined(RAJA_ENABLE_HIP) + RAJA::resources::Hip res_gpu1; + RAJA::resources::Hip res_gpu2; + RAJA::resources::Host res_host; + + using EXEC_POLICY = RAJA::hip_exec_async; +#endif + + // _raja_res_alloc_start + int* d_array1 = res_gpu1.allocate(N); + int* d_array2 = res_gpu2.allocate(N); + int* h_array = res_host.allocate(N); + // _raja_res_alloc_end + + // _raja_res_k1_start + RAJA::forall(res_gpu1, RAJA::RangeSegment(0,N), + [=] RAJA_HOST_DEVICE (int i) { + d_array1[i] = i; + } + ); + // _raja_res_k1_end + + // _raja_res_k2_start + RAJA::resources::Event e = RAJA::forall(res_gpu2, RAJA::RangeSegment(0,N), + [=] RAJA_HOST_DEVICE (int i) { + d_array2[i] = -1; + } + ); + // _raja_res_k2_end + + // _raja_res_wait_start + res_gpu2.wait_for(&e); + // _raja_res_wait_end + + // _raja_res_k3_start + RAJA::forall(res_gpu1, RAJA::RangeSegment(0,N), + [=] RAJA_HOST_DEVICE (int i) { + d_array1[i] *= d_array2[i]; + } + ); + // _raja_res_k3_end + + // _raja_res_memcpy_start + res_gpu1.memcpy(h_array, d_array1, sizeof(int) * N); + // _raja_res_memcpy_end + + // _raja_res_k4_start + bool check = true; + RAJA::forall(res_host, RAJA::RangeSegment(0,N), + [&check, h_array] (int i) { + if(h_array[i] != -i) {check = false;} + } + ); + // _raja_res_k4_end + + std::cout << "\n result -- "; + if (check) std::cout << "PASS\n"; + else std::cout << "FAIL\n"; + + res_gpu1.deallocate(d_array1); + res_gpu2.deallocate(d_array2); + res_host.deallocate(h_array); + +} + +#endif +// +// +// Clean up. 
+// + host.deallocate(a); + host.deallocate(b); + host.deallocate(c); + + host.deallocate(a_); + host.deallocate(b_); + host.deallocate(c_); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +void checkResult(int* res, int len) +{ + bool correct = true; + for (int i = 0; i < len; i++) { + if ( res[i] != i ) { correct = false; } + } + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} + +// +// Function to print result. +// +void printResult(int* res, int len) +{ + std::cout << std::endl; + for (int i = 0; i < len; i++) { + std::cout << "result[" << i << "] = " << res[i] << std::endl; + } + std::cout << std::endl; +} + diff --git a/examples/tut_dot-product.cpp b/examples/tut_dot-product.cpp index a1caab4ac5..a3e853697e 100644 --- a/examples/tut_dot-product.cpp +++ b/examples/tut_dot-product.cpp @@ -59,8 +59,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate and initialize vector data // - int *a = memoryManager::allocate(N); - int *b = memoryManager::allocate(N); + double *a = memoryManager::allocate(N); + double *b = memoryManager::allocate(N); for (int i = 0; i < N; ++i) { a[i] = 1.0; diff --git a/examples/tut_halo-exchange.cpp b/examples/tut_halo-exchange.cpp new file mode 100644 index 0000000000..bd7b823e76 --- /dev/null +++ b/examples/tut_halo-exchange.cpp @@ -0,0 +1,1871 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" +#include "RAJA/util/Timer.hpp" + +/* + * Halo exchange Example + * + * Packs and Unpacks data from 3D variables as is done in a halo exchange. + * It illustrates how to use the workgroup set of constructs. + * + * RAJA features shown: + * - `WorkPool` template object + * - `WorkGroup` template object + * - `WorkSite` template object + * - Index range segment + * - WorkGroup policies + * + * If CUDA is enabled, CUDA unified memory is used. 
+ */ + +/* + CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block when using forall + CUDA_WORKGROUP_BLOCK_SIZE - specifies the number of threads in a CUDA thread block when using workgroup +*/ +#if defined(RAJA_ENABLE_CUDA) +const int CUDA_BLOCK_SIZE = 256; +const int CUDA_WORKGROUP_BLOCK_SIZE = 1024; +#endif + +#if defined(RAJA_ENABLE_HIP) +const int HIP_BLOCK_SIZE = 256; +const int HIP_WORKGROUP_BLOCK_SIZE = 1024; +#endif + +/* + num_neighbors - specifies the number of neighbors that each process would be + communicating with in 3D halo exchange +*/ +const int num_neighbors = 26; + +// +// Functions for checking and printing results +// +void checkResult(std::vector const& vars, std::vector const& vars_ref, + int var_size, int num_vars); +void printResult(std::vector const& vars, int var_size, int num_vars); + +// +// Functions for allocating and populating packing and unpacking lists +// +void create_pack_lists(std::vector& pack_index_lists, std::vector& pack_index_list_lengths, + const int halo_width, const int* grid_dims); +void create_unpack_lists(std::vector& unpack_index_lists, std::vector& unpack_index_list_lengths, + const int halo_width, const int* grid_dims); +void destroy_pack_lists(std::vector& pack_index_lists); +void destroy_unpack_lists(std::vector& unpack_index_lists); + + +template < typename T > +struct memory_manager_allocator +{ + using value_type = T; + + memory_manager_allocator() = default; + + template < typename U > + constexpr memory_manager_allocator(memory_manager_allocator const&) noexcept + { } + + /*[[nodiscard]]*/ + value_type* allocate(size_t num) + { + if (num > std::numeric_limits::max() / sizeof(value_type)) { + throw std::bad_alloc(); + } + + value_type *ptr = memoryManager::allocate(num); + + if (!ptr) { + throw std::bad_alloc(); + } + + return ptr; + } + + void deallocate(value_type* ptr, size_t) noexcept + { + value_type* ptrc = static_cast(ptr); + memoryManager::deallocate(ptrc); + } +}; + +template +bool operator==(memory_manager_allocator const&, memory_manager_allocator const&) +{ + return true; +} + +template +bool operator!=(memory_manager_allocator const& lhs, memory_manager_allocator const& rhs) +{ + return !(lhs == rhs); +} + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + +template < typename T > +struct pinned_allocator +{ + using value_type = T; + + pinned_allocator() = default; + + template < typename U > + constexpr pinned_allocator(pinned_allocator const&) noexcept + { } + + /*[[nodiscard]]*/ + value_type* allocate(size_t num) + { + if (num > std::numeric_limits::max() / sizeof(value_type)) { + throw std::bad_alloc(); + } + + value_type *ptr = nullptr; +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaMallocHost((void **)&ptr, num*sizeof(value_type))); +#elif defined(RAJA_ENABLE_HIP) + hipErrchk(hipHostMalloc((void **)&ptr, num*sizeof(value_type))); +#endif + + if (!ptr) { + throw std::bad_alloc(); + } + + return ptr; + } + + void deallocate(value_type* ptr, size_t) noexcept + { +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaFreeHost(ptr)); +#elif defined(RAJA_ENABLE_HIP) + hipErrchk(hipHostFree(ptr)); +#endif + } +}; + +template +bool operator==(pinned_allocator const&, pinned_allocator const&) +{ + return true; +} + +template +bool operator!=(pinned_allocator const& lhs, pinned_allocator const& rhs) +{ + return !(lhs == rhs); +} + + +#endif + +int main(int argc, char **argv) +{ + + std::cout << "\n\nRAJA halo exchange example...\n"; + + if (argc != 1 && argc != 7) { + std::cerr << "Usage: 
tut_halo-exchange " + << "[grid_x grid_y grid_z halo_width num_vars num_cycles]\n"; + std::exit(1); + } + + // _halo_exchange_input_params_start + // + // Define grid dimensions + // Define halo width + // Define number of grid variables + // Define number of cycles + // + const int grid_dims[3] = { (argc != 7) ? 100 : std::atoi(argv[1]), + (argc != 7) ? 100 : std::atoi(argv[2]), + (argc != 7) ? 100 : std::atoi(argv[3]) }; + const int halo_width = (argc != 7) ? 1 : std::atoi(argv[4]); + const int num_vars = (argc != 7) ? 3 : std::atoi(argv[5]); + const int num_cycles = (argc != 7) ? 3 : std::atoi(argv[6]); + // _halo_exchange_input_params_end + + std::cout << "grid dimensions " << grid_dims[0] + << " x " << grid_dims[1] + << " x " << grid_dims[2] << "\n" + << "halo width " << halo_width << "\n" + << "number of variables " << num_vars << "\n" + << "number of cycles " << num_cycles << "\n"; + + if ( grid_dims[0] < halo_width || + grid_dims[1] < halo_width || + grid_dims[2] < halo_width ) { + std::cerr << "Error: " + << "grid dimensions must not be smaller than the halo width\n"; + std::exit(1); + } + + const int grid_plus_halo_dims[3] = { grid_dims[0] + 2*halo_width, + grid_dims[1] + 2*halo_width, + grid_dims[2] + 2*halo_width }; + + const int var_size = grid_plus_halo_dims[0] * + grid_plus_halo_dims[1] * + grid_plus_halo_dims[2] ; + + // _halo_exchange_vars_allocate_start + // + // Allocate grid variables and reference grid variables used to check + // correctness. + // + std::vector vars (num_vars, nullptr); + std::vector vars_ref(num_vars, nullptr); + + for (int v = 0; v < num_vars; ++v) { + vars[v] = memoryManager::allocate(var_size); + vars_ref[v] = memoryManager::allocate(var_size); + } + // _halo_exchange_vars_allocate_end + + + // _halo_exchange_index_list_generate_start + // + // Generate index lists for packing and unpacking + // + std::vector pack_index_lists(num_neighbors, nullptr); + std::vector pack_index_list_lengths(num_neighbors, 0); + create_pack_lists(pack_index_lists, pack_index_list_lengths, halo_width, grid_dims); + + std::vector unpack_index_lists(num_neighbors, nullptr); + std::vector unpack_index_list_lengths(num_neighbors, 0); + create_unpack_lists(unpack_index_lists, unpack_index_list_lengths, halo_width, grid_dims); + // _halo_exchange_index_lisgeneratete_end + + + // + // Convenience type alias to reduce typing + // + using range_segment = RAJA::TypedRangeSegment; + + + auto timer = RAJA::Timer(); + + +//----------------------------------------------------------------------------// + { + std::cout << "\n Running C-style halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate(buffer_len); + + } + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + for (int i = 0; i < var_size; i++) { + var[i] = i + v; + } + } + + // _halo_exchange_sequential_cstyle_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + for (int i = 0; i < len; i++) { + buffer[i] = var[list[i]]; + } + + buffer += len; + } + + // send single message + } + // _halo_exchange_sequential_cstyle_packing_end + + 
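+      // The unpacking loop below mirrors the packing loop above: for each
+      // neighbor it walks the unpack index list and scatters the received
+      // buffer values back into the halo cells of each grid variable.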
// _halo_exchange_sequential_cstyle_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + // recv single message + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + for (int i = 0; i < len; i++) { + var[list[i]] = buffer[i]; + } + + buffer += len; + } + } + // _halo_exchange_sequential_cstyle_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate(buffers[l]); + + } + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // copy result of exchange for reference later + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + double* var_ref = vars_ref[v]; + + for (int i = 0; i < var_size; i++) { + var_ref[i] = var[i]; + } + } + } + + +//----------------------------------------------------------------------------// +// Separate packing/unpacking loops using forall +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA loop forall halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + // _halo_exchange_loop_forall_policies_start + using forall_policy = RAJA::loop_exec; + // _halo_exchange_loop_forall_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate(buffer_len); + + } + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_loop_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] (int i) { + buffer[i] = var[list[i]]; + }); + + buffer += len; + } + + // send single message + } + // _halo_exchange_loop_forall_packing_end + + // _halo_exchange_loop_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) { + + // recv single message + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] (int i) { + var[list[i]] = buffer[i]; + }); + + buffer += len; + } + } + // _halo_exchange_loop_forall_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate(buffers[l]); + + } + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + + +//----------------------------------------------------------------------------// +// RAJA::WorkGroup with allows deferred execution +// This has overhead and indirection not in the separate loop version, +// but can be 
useful for debugging. +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA loop workgroup halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + // _halo_exchange_loop_workgroup_policies_start + using forall_policy = RAJA::loop_exec; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::loop_work, + RAJA::ordered, + RAJA::ragged_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + int, + RAJA::xargs<>, + memory_manager_allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + int, + RAJA::xargs<>, + memory_manager_allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + int, + RAJA::xargs<>, + memory_manager_allocator >; + // _halo_exchange_loop_workgroup_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate(buffer_len); + + } + + workpool pool_pack (memory_manager_allocator{}); + workpool pool_unpack(memory_manager_allocator{}); + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_loop_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_pack.enqueue(range_segment(0, len), [=] (int i) { + buffer[i] = var[list[i]]; + }); + + buffer += len; + } + } + + workgroup group_pack = pool_pack.instantiate(); + + worksite site_pack = group_pack.run(); + + // send all messages + // _halo_exchange_loop_workgroup_packing_end + + // _halo_exchange_loop_workgroup_unpacking_start + // recv all messages + + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_unpack.enqueue(range_segment(0, len), [=] (int i) { + var[list[i]] = buffer[i]; + }); + + buffer += len; + } + } + + workgroup group_unpack = pool_unpack.instantiate(); + + worksite site_unpack = group_unpack.run(); + // _halo_exchange_loop_workgroup_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate(buffers[l]); + + } + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + + +//----------------------------------------------------------------------------// + + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// Separate packing/unpacking loops using forall +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA Openmp forall halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + // _halo_exchange_openmp_forall_policies_start + using forall_policy = RAJA::omp_parallel_for_exec; 
+ // _halo_exchange_openmp_forall_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate(buffer_len); + + } + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_openmp_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] (int i) { + buffer[i] = var[list[i]]; + }); + + buffer += len; + } + + // send single message + } + // _halo_exchange_openmp_forall_packing_end + + // _halo_exchange_openmp_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) { + + // recv single message + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] (int i) { + var[list[i]] = buffer[i]; + }); + + buffer += len; + } + } + // _halo_exchange_openmp_forall_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate(buffers[l]); + + } + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + + +//----------------------------------------------------------------------------// +// RAJA::WorkGroup may allow effective parallelism across loops with Openmp. 
+//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA OpenMP workgroup halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + // _halo_exchange_openmp_workgroup_policies_start + using forall_policy = RAJA::omp_parallel_for_exec; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::omp_work, + RAJA::ordered, + RAJA::ragged_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + int, + RAJA::xargs<>, + memory_manager_allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + int, + RAJA::xargs<>, + memory_manager_allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + int, + RAJA::xargs<>, + memory_manager_allocator >; + // _halo_exchange_openmp_workgroup_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate(buffer_len); + + } + + workpool pool_pack (memory_manager_allocator{}); + workpool pool_unpack(memory_manager_allocator{}); + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_openmp_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_pack.enqueue(range_segment(0, len), [=] (int i) { + buffer[i] = var[list[i]]; + }); + + buffer += len; + } + } + + workgroup group_pack = pool_pack.instantiate(); + + worksite site_pack = group_pack.run(); + + // send all messages + // _halo_exchange_openmp_workgroup_packing_end + + // _halo_exchange_openmp_workgroup_unpacking_start + // recv all messages + + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_unpack.enqueue(range_segment(0, len), [=] (int i) { + var[list[i]] = buffer[i]; + }); + + buffer += len; + } + } + + workgroup group_unpack = pool_unpack.instantiate(); + + worksite site_unpack = group_unpack.run(); + // _halo_exchange_openmp_workgroup_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate(buffers[l]); + + } + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + +#endif + + +//----------------------------------------------------------------------------// + + +#if defined(RAJA_ENABLE_CUDA) + +//----------------------------------------------------------------------------// +// Separate packing/unpacking loops using forall +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA Cuda forall halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + + std::vector cuda_vars(num_vars, nullptr); + std::vector 
cuda_pack_index_lists(num_neighbors, nullptr); + std::vector cuda_unpack_index_lists(num_neighbors, nullptr); + + for (int v = 0; v < num_vars; ++v) { + cuda_vars[v] = memoryManager::allocate_gpu(var_size); + } + + for (int l = 0; l < num_neighbors; ++l) { + int pack_len = pack_index_list_lengths[l]; + cuda_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); + cudaErrchk(cudaMemcpy( cuda_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), cudaMemcpyDefault )); + + int unpack_len = unpack_index_list_lengths[l]; + cuda_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); + cudaErrchk(cudaMemcpy( cuda_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), cudaMemcpyDefault )); + } + + std::swap(vars, cuda_vars); + std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(unpack_index_lists, cuda_unpack_index_lists); + + + // _halo_exchange_cuda_forall_policies_start + using forall_policy = RAJA::cuda_exec_async; + // _halo_exchange_cuda_forall_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate_gpu(buffer_len); + + } + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_cuda_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { + buffer[i] = var[list[i]]; + }); + + buffer += len; + } + + cudaErrchk(cudaDeviceSynchronize()); + + // send single message + } + // _halo_exchange_cuda_forall_packing_end + + // _halo_exchange_cuda_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) { + + // recv single message + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { + var[list[i]] = buffer[i]; + }); + + buffer += len; + } + } + + cudaErrchk(cudaDeviceSynchronize()); + // _halo_exchange_cuda_forall_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate_gpu(buffers[l]); + + } + + + std::swap(vars, cuda_vars); + std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(unpack_index_lists, cuda_unpack_index_lists); + + for (int v = 0; v < num_vars; ++v) { + cudaErrchk(cudaMemcpy( vars[v], cuda_vars[v], var_size * sizeof(double), cudaMemcpyDefault )); + memoryManager::deallocate_gpu(cuda_vars[v]); + } + + for (int l = 0; l < num_neighbors; ++l) { + memoryManager::deallocate_gpu(cuda_pack_index_lists[l]); + memoryManager::deallocate_gpu(cuda_unpack_index_lists[l]); + } + + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + + 
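+//
+// The workgroup version below follows a three-step pattern:
+//   1. pool.enqueue(range, lambda)  - collect loop bodies in a WorkPool
+//   2. pool.instantiate()           - build a WorkGroup from the enqueued loops
+//   3. group.run()                  - execute the group, returning a WorkSite
+// With the cuda_work policy the enqueued pack/unpack loops can be fused and
+// executed together, which can reduce launch overhead relative to issuing one
+// forall per loop as in the section above.
+//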
+//----------------------------------------------------------------------------// +// RAJA::WorkGroup with cuda_work allows deferred kernel fusion execution +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA Cuda workgroup halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + + std::vector cuda_vars(num_vars, nullptr); + std::vector cuda_pack_index_lists(num_neighbors, nullptr); + std::vector cuda_unpack_index_lists(num_neighbors, nullptr); + + for (int v = 0; v < num_vars; ++v) { + cuda_vars[v] = memoryManager::allocate_gpu(var_size); + } + + for (int l = 0; l < num_neighbors; ++l) { + int pack_len = pack_index_list_lengths[l]; + cuda_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); + cudaErrchk(cudaMemcpy( cuda_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), cudaMemcpyDefault )); + + int unpack_len = unpack_index_list_lengths[l]; + cuda_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); + cudaErrchk(cudaMemcpy( cuda_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), cudaMemcpyDefault )); + } + + std::swap(vars, cuda_vars); + std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(unpack_index_lists, cuda_unpack_index_lists); + + + // _halo_exchange_cuda_workgroup_policies_start + using forall_policy = RAJA::cuda_exec_async; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + // _halo_exchange_cuda_workgroup_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate_gpu(buffer_len); + + } + + workpool pool_pack (pinned_allocator{}); + workpool pool_unpack(pinned_allocator{}); + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_cuda_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_pack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { + buffer[i] = var[list[i]]; + }); + + buffer += len; + } + } + + workgroup group_pack = pool_pack.instantiate(); + + worksite site_pack = group_pack.run(); + + cudaErrchk(cudaDeviceSynchronize()); + + // send all messages + // _halo_exchange_cuda_workgroup_packing_end + + // _halo_exchange_cuda_workgroup_unpacking_start + // recv all messages + + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_unpack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { + var[list[i]] = buffer[i]; + }); + + buffer 
+= len; + } + } + + workgroup group_unpack = pool_unpack.instantiate(); + + worksite site_unpack = group_unpack.run(); + + cudaErrchk(cudaDeviceSynchronize()); + // _halo_exchange_cuda_workgroup_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate_gpu(buffers[l]); + + } + + + std::swap(vars, cuda_vars); + std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(unpack_index_lists, cuda_unpack_index_lists); + + for (int v = 0; v < num_vars; ++v) { + cudaErrchk(cudaMemcpy( vars[v], cuda_vars[v], var_size * sizeof(double), cudaMemcpyDefault )); + memoryManager::deallocate_gpu(cuda_vars[v]); + } + + for (int l = 0; l < num_neighbors; ++l) { + memoryManager::deallocate_gpu(cuda_pack_index_lists[l]); + memoryManager::deallocate_gpu(cuda_unpack_index_lists[l]); + } + + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + +#endif + + +//----------------------------------------------------------------------------// + + +#if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// +// Separate packing/unpacking loops using forall +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA Hip forall halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + + std::vector hip_vars(num_vars, nullptr); + std::vector hip_pack_index_lists(num_neighbors, nullptr); + std::vector hip_unpack_index_lists(num_neighbors, nullptr); + + for (int v = 0; v < num_vars; ++v) { + hip_vars[v] = memoryManager::allocate_gpu(var_size); + } + + for (int l = 0; l < num_neighbors; ++l) { + int pack_len = pack_index_list_lengths[l]; + hip_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); + hipErrchk(hipMemcpy( hip_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), hipMemcpyHostToDevice )); + + int unpack_len = unpack_index_list_lengths[l]; + hip_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); + hipErrchk(hipMemcpy( hip_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), hipMemcpyHostToDevice )); + } + + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(unpack_index_lists, hip_unpack_index_lists); + + + // _halo_exchange_hip_forall_policies_start + using forall_policy = RAJA::hip_exec_async; + // _halo_exchange_hip_forall_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate_gpu(buffer_len); + + } + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_hip_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { + buffer[i] = 
var[list[i]]; + }); + + buffer += len; + } + + hipErrchk(hipDeviceSynchronize()); + + // send single message + } + // _halo_exchange_hip_forall_packing_end + + // _halo_exchange_hip_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) { + + // recv single message + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { + var[list[i]] = buffer[i]; + }); + + buffer += len; + } + } + + hipErrchk(hipDeviceSynchronize()); + // _halo_exchange_hip_forall_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate_gpu(buffers[l]); + + } + + + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(unpack_index_lists, hip_unpack_index_lists); + + for (int v = 0; v < num_vars; ++v) { + hipErrchk(hipMemcpy( vars[v], hip_vars[v], var_size * sizeof(double), hipMemcpyDeviceToHost )); + memoryManager::deallocate_gpu(hip_vars[v]); + } + + for (int l = 0; l < num_neighbors; ++l) { + memoryManager::deallocate_gpu(hip_pack_index_lists[l]); + memoryManager::deallocate_gpu(hip_unpack_index_lists[l]); + } + + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + + +//----------------------------------------------------------------------------// +// RAJA::WorkGroup with hip_work allows deferred kernel fusion execution +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA Hip workgroup halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + + std::vector hip_vars(num_vars, nullptr); + std::vector hip_pack_index_lists(num_neighbors, nullptr); + std::vector hip_unpack_index_lists(num_neighbors, nullptr); + + for (int v = 0; v < num_vars; ++v) { + hip_vars[v] = memoryManager::allocate_gpu(var_size); + } + + for (int l = 0; l < num_neighbors; ++l) { + int pack_len = pack_index_list_lengths[l]; + hip_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); + hipErrchk(hipMemcpy( hip_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), hipMemcpyHostToDevice )); + + int unpack_len = unpack_index_list_lengths[l]; + hip_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); + hipErrchk(hipMemcpy( hip_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), hipMemcpyHostToDevice )); + } + + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(unpack_index_lists, hip_unpack_index_lists); + + + // _halo_exchange_hip_workgroup_policies_start + using forall_policy = RAJA::hip_exec_async; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, +#if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, +#else + RAJA::ordered, +#endif + RAJA::constant_stride_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + + using worksite = 
RAJA::WorkSite< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + // _halo_exchange_hip_workgroup_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate_gpu(buffer_len); + + } + + workpool pool_pack (pinned_allocator{}); + workpool pool_unpack(pinned_allocator{}); + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_hip_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_pack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { + buffer[i] = var[list[i]]; + }); + + buffer += len; + } + } + + workgroup group_pack = pool_pack.instantiate(); + + worksite site_pack = group_pack.run(); + + hipErrchk(hipDeviceSynchronize()); + + // send all messages + // _halo_exchange_hip_workgroup_packing_end + + // _halo_exchange_hip_workgroup_unpacking_start + // recv all messages + + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_unpack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { + var[list[i]] = buffer[i]; + }); + + buffer += len; + } + } + + workgroup group_unpack = pool_unpack.instantiate(); + + worksite site_unpack = group_unpack.run(); + + hipErrchk(hipDeviceSynchronize()); + // _halo_exchange_hip_workgroup_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate_gpu(buffers[l]); + + } + + + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(unpack_index_lists, hip_unpack_index_lists); + + for (int v = 0; v < num_vars; ++v) { + hipErrchk(hipMemcpy( vars[v], hip_vars[v], var_size * sizeof(double), hipMemcpyDeviceToHost )); + memoryManager::deallocate_gpu(hip_vars[v]); + } + + for (int l = 0; l < num_neighbors; ++l) { + memoryManager::deallocate_gpu(hip_pack_index_lists[l]); + memoryManager::deallocate_gpu(hip_unpack_index_lists[l]); + } + + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + +#endif + + +//----------------------------------------------------------------------------// + + +// +// Clean up. +// + for (int v = 0; v < num_vars; ++v) { + memoryManager::deallocate(vars[v]); + memoryManager::deallocate(vars_ref[v]); + } + + destroy_pack_lists(pack_index_lists); + destroy_unpack_lists(unpack_index_lists); + + + std::cout << "\n DONE!...\n"; + + return 0; +} + + +// +// Function to compare result to reference and report P/F. 
+// +void checkResult(std::vector const& vars, std::vector const& vars_ref, + int var_size, int num_vars) +{ + bool correct = true; + for (int v = 0; v < num_vars; ++v) { + double* var = vars[v]; + double* var_ref = vars_ref[v]; + for (int i = 0; i < var_size; i++) { + if ( var[i] != var_ref[i] ) { correct = false; } + } + } + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} + +// +// Function to print result. +// +void printResult(std::vector const& vars, int var_size, int num_vars) +{ + std::cout << std::endl; + for (int v = 0; v < num_vars; ++v) { + double* var = vars[v]; + for (int i = 0; i < var_size; i++) { + std::cout << "result[" << i << "] = " << var[i] << std::endl; + } + } + std::cout << std::endl; +} + + +struct Extent +{ + int i_min; + int i_max; + int j_min; + int j_max; + int k_min; + int k_max; +}; + +// +// Function to generate index lists for packing. +// +void create_pack_lists(std::vector& pack_index_lists, + std::vector& pack_index_list_lengths, + const int halo_width, const int* grid_dims) +{ + std::vector pack_index_list_extents(num_neighbors); + + // faces + pack_index_list_extents[0] = Extent{halo_width , halo_width + halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[1] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + + // edges + pack_index_list_extents[6] = Extent{halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[7] = Extent{halo_width , halo_width + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[8] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[9] = Extent{grid_dims[0], grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[10] = Extent{halo_width , halo_width + halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[11] = Extent{halo_width , halo_width + halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[12] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[13] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , 
halo_width + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + + // corners + pack_index_list_extents[18] = Extent{halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[19] = Extent{halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[20] = Extent{halo_width , halo_width + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[21] = Extent{halo_width , halo_width + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[22] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[23] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[24] = Extent{grid_dims[0], grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[25] = Extent{grid_dims[0], grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + + const int grid_i_stride = 1; + const int grid_j_stride = grid_dims[0] + 2*halo_width; + const int grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); + + for (int l = 0; l < num_neighbors; ++l) { + + Extent extent = pack_index_list_extents[l]; + + pack_index_list_lengths[l] = (extent.i_max - extent.i_min) * + (extent.j_max - extent.j_min) * + (extent.k_max - extent.k_min) ; + + pack_index_lists[l] = memoryManager::allocate(pack_index_list_lengths[l]); + + int* pack_list = pack_index_lists[l]; + + int list_idx = 0; + for (int kk = extent.k_min; kk < extent.k_max; ++kk) { + for (int jj = extent.j_min; jj < extent.j_max; ++jj) { + for (int ii = extent.i_min; ii < extent.i_max; ++ii) { + + int pack_idx = ii * grid_i_stride + + jj * grid_j_stride + + kk * grid_k_stride ; + + pack_list[list_idx] = pack_idx; + + list_idx += 1; + } + } + } + } +} + +// +// Function to destroy packing index lists. +// +void destroy_pack_lists(std::vector& pack_index_lists) +{ + for (int l = 0; l < num_neighbors; ++l) { + memoryManager::deallocate(pack_index_lists[l]); + } +} + + +// +// Function to generate index lists for unpacking. 
+// +void create_unpack_lists(std::vector& unpack_index_lists, std::vector& unpack_index_list_lengths, + const int halo_width, const int* grid_dims) +{ + std::vector unpack_index_list_extents(num_neighbors); + + // faces + unpack_index_list_extents[0] = Extent{0 , halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[1] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, + 0 , halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + 0 , halo_width}; + unpack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + + // edges + unpack_index_list_extents[6] = Extent{0 , halo_width, + 0 , halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[7] = Extent{0 , halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[8] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + 0 , halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[9] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[10] = Extent{0 , halo_width, + halo_width , grid_dims[1] + halo_width, + 0 , halo_width}; + unpack_index_list_extents[11] = Extent{0 , halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[12] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + halo_width , grid_dims[1] + halo_width, + 0 , halo_width}; + unpack_index_list_extents[13] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, + 0 , halo_width, + 0 , halo_width}; + unpack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, + 0 , halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + 0 , halo_width}; + unpack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + + // corners + unpack_index_list_extents[18] = Extent{0 , halo_width, + 0 , halo_width, + 0 , halo_width}; + unpack_index_list_extents[19] = Extent{0 , halo_width, + 0 , halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[20] = Extent{0 , halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + 0 , halo_width}; + unpack_index_list_extents[21] = Extent{0 , halo_width, + grid_dims[1] + halo_width, 
grid_dims[1] + 2*halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[22] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + 0 , halo_width, + 0 , halo_width}; + unpack_index_list_extents[23] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + 0 , halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[24] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + 0 , halo_width}; + unpack_index_list_extents[25] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + + const int grid_i_stride = 1; + const int grid_j_stride = grid_dims[0] + 2*halo_width; + const int grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); + + for (int l = 0; l < num_neighbors; ++l) { + + Extent extent = unpack_index_list_extents[l]; + + unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) * + (extent.j_max - extent.j_min) * + (extent.k_max - extent.k_min) ; + + unpack_index_lists[l] = memoryManager::allocate(unpack_index_list_lengths[l]); + + int* unpack_list = unpack_index_lists[l]; + + int list_idx = 0; + for (int kk = extent.k_min; kk < extent.k_max; ++kk) { + for (int jj = extent.j_min; jj < extent.j_max; ++jj) { + for (int ii = extent.i_min; ii < extent.i_max; ++ii) { + + int unpack_idx = ii * grid_i_stride + + jj * grid_j_stride + + kk * grid_k_stride ; + + unpack_list[list_idx] = unpack_idx; + + list_idx += 1; + } + } + } + } +} + +// +// Function to destroy unpacking index lists. +// +void destroy_unpack_lists(std::vector& unpack_index_lists) +{ + for (int l = 0; l < num_neighbors; ++l) { + memoryManager::deallocate(unpack_index_lists[l]); + } +} diff --git a/examples/tut_indexset-segments.cpp b/examples/tut_indexset-segments.cpp index 17a18c85fa..5750a8066e 100644 --- a/examples/tut_indexset-segments.cpp +++ b/examples/tut_indexset-segments.cpp @@ -15,6 +15,8 @@ #include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" + /* * Index sets and Segments Example * @@ -127,6 +129,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //printResult(a, N); //----------------------------------------------------------------------------// +// Resource object used to construct list segment objects with indices +// living in host (CPU) memory. 
+ + camp::resources::Resource host_res{camp::resources::Host()}; + + // // RAJA list segment version #1 // @@ -144,7 +152,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) idx.push_back(i); } - ListSegType idx_list( &idx[0], idx.size() ); + ListSegType idx_list( &idx[0], idx.size(), host_res ); RAJA::forall(idx_list, [=] (IdxType i) { a[i] += b[i] * c; @@ -168,7 +176,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_list_segment_daxpy_reverse_start std::reverse( idx.begin(), idx.end() ); - ListSegType idx_reverse_list( &idx[0], idx.size() ); + ListSegType idx_reverse_list( &idx[0], idx.size(), host_res ); RAJA::forall(idx_reverse_list, [=] (IdxType i) { a[i] += b[i] * c; @@ -267,7 +275,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) idx1.push_back(i); } - ListSegType idx1_list( &idx1[0], idx1.size() ); + ListSegType idx1_list( &idx1[0], idx1.size(), host_res ); RAJA::TypedIndexSet is3; is3.push_back( RAJA::RangeSegment(0, N/3) ); @@ -333,6 +341,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) + +// +// We create a new resource object and index set so that list segment +// indices live in CUDA device memory. +// + camp::resources::Resource cuda_res{camp::resources::Cuda()}; + + ListSegType idx1_list_cuda( &idx1[0], idx1.size(), cuda_res ); + + RAJA::TypedIndexSet is3_cuda; + is3_cuda.push_back( RAJA::RangeSegment(0, N/3) ); + is3_cuda.push_back( idx1_list_cuda ); + is3_cuda.push_back( RAJA::RangeSegment(2*N/3, N) ); + + std::cout << "\n Running RAJA index set (2 RangeSegments, 1 ListSegment) daxpy\n" << " (sequential iteration over segments, CUDA parallel segment execution)...\n"; @@ -344,7 +367,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memcpy( a, a0, N * sizeof(double) ); - RAJA::forall(is3, [=] RAJA_DEVICE (IdxType i) { + RAJA::forall(is3_cuda, [=] RAJA_DEVICE (IdxType i) { a[i] += b[i] * c; }); @@ -355,6 +378,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) + +// +// We create a new resource object and index set so that list segment +// indices live in Hip device memory. 
+// + camp::resources::Resource hip_res{camp::resources::Hip()}; + + ListSegType idx1_list_hip( &idx1[0], idx1.size(), hip_res ); + + RAJA::TypedIndexSet is3_hip; + is3_hip.push_back( RAJA::RangeSegment(0, N/3) ); + is3_hip.push_back( idx1_list_hip ); + is3_hip.push_back( RAJA::RangeSegment(2*N/3, N) ); + std::cout << "\n Running RAJA index set (2 RangeSegments, 1 ListSegment) daxpy\n" << " (sequential iteration over segments, HIP parallel segment execution)...\n"; @@ -368,7 +405,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_a, a0, N * sizeof(double), hipMemcpyHostToDevice )); hipErrchk(hipMemcpy( d_b, b, N * sizeof(double), hipMemcpyHostToDevice )); - RAJA::forall(is3, [=] RAJA_DEVICE (IdxType i) { + RAJA::forall(is3_hip, [=] RAJA_DEVICE (IdxType i) { d_a[i] += d_b[i] * c; }); diff --git a/examples/tut_matrix-multiply.cpp b/examples/tut_matrix-multiply.cpp index ce9ceb750c..9154bd22df 100644 --- a/examples/tut_matrix-multiply.cpp +++ b/examples/tut_matrix-multiply.cpp @@ -472,8 +472,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXEC_POL5 = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_loop, RAJA::statement::For<0, RAJA::cuda_thread_x_loop, RAJA::statement::Lambda<0> @@ -575,8 +575,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXEC_POL5 = RAJA::KernelPolicy< RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_y_loop, RAJA::statement::For<0, RAJA::hip_thread_x_loop, RAJA::statement::Lambda<0> @@ -634,11 +634,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::loop_exec, RAJA::statement::For<0, RAJA::loop_exec, - RAJA::statement::Lambda<0>, // dot = 0.0 + RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 RAJA::statement::For<2, RAJA::loop_exec, RAJA::statement::Lambda<1> // inner loop: dot += ... 
>, - RAJA::statement::Lambda<2> // set C(row, col) = dot + RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C(row, col) = dot > > >; @@ -649,7 +649,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::tuple{0.0}, // thread local variable for 'dot' // lambda 0 - [=] (int /* col */, int /* row */, int /* k */, double& dot) { + [=] (double& dot) { dot = 0.0; }, @@ -659,7 +659,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // lambda 2 - [=] (int col, int row, int /* k */, double& dot) { + [=] (int col, int row, double& dot) { Cview(row, col) = dot; } @@ -683,8 +683,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _matmult_3lambdakernel_args_seq_start // Alias for convenience - using RAJA::statement::Segs; - using RAJA::statement::Params; + using RAJA::Segs; + using RAJA::Params; using EXEC_POL6b = RAJA::KernelPolicy< @@ -738,11 +738,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::KernelPolicy< RAJA::statement::Collapse, // row, col - RAJA::statement::Lambda<0>, // dot = 0.0 + RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 RAJA::statement::For<2, RAJA::loop_exec, RAJA::statement::Lambda<1> // inner loop: dot += ... >, - RAJA::statement::Lambda<2> // set C(row, col) = dot + RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C(row, col) = dot > >; // _matmult_3lambdakernel_ompcollapse_end @@ -753,7 +753,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::tuple{0.0}, // thread local variable for 'dot' // lambda 0 - [=] (int /* col */, int /* row */, int /* k */, double& dot) { + [=] (double& dot) { dot = 0.0; }, @@ -763,7 +763,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // lambda 2 - [=] (int col, int row, int /* k */, double& dot) { + [=] (int col, int row, double& dot) { Cview(row, col) = dot; } @@ -787,11 +787,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::CudaKernel< RAJA::statement::For<1, RAJA::cuda_block_x_loop, // row RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // col - RAJA::statement::Lambda<0>, // dot = 0.0 + RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 RAJA::statement::For<2, RAJA::seq_exec, RAJA::statement::Lambda<1> // dot += ... >, - RAJA::statement::Lambda<2> // set C = ... + RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C = ... 
> > > @@ -804,7 +804,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::tuple{0.0}, // thread local variable for 'dot' // lambda 0 - [=] RAJA_DEVICE (int /* col */, int /* row */, int /* k */, double& dot) { + [=] RAJA_DEVICE (double& dot) { dot = 0.0; }, @@ -814,7 +814,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // lambda 2 - [=] RAJA_DEVICE (int col, int row, int /* k */, double& dot) { + [=] RAJA_DEVICE (int col, int row, double& dot) { Cview(row, col) = dot; } @@ -833,15 +833,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXEC_POL9a = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // row RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // col - RAJA::statement::Lambda<0>, // dot = 0.0 + RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 RAJA::statement::For<2, RAJA::seq_exec, RAJA::statement::Lambda<1> // dot += ... >, - RAJA::statement::Lambda<2> // set C = ... + RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C = ... > > > @@ -856,7 +856,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::tuple{0.0}, // thread local variable for 'dot' // lambda 0 - [=] RAJA_DEVICE (int /* col */, int /* row */, int /* k */, double& dot) { + [=] RAJA_DEVICE (double& dot) { dot = 0.0; }, @@ -866,7 +866,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // lambda 2 - [=] RAJA_DEVICE (int col, int row, int /* k */, double& dot) { + [=] RAJA_DEVICE (int col, int row, double& dot) { Cview(row, col) = dot; } @@ -884,8 +884,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXEC_POL9b = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // row RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // col RAJA::statement::Lambda<0, Params<0>>, // dot = 0.0 @@ -967,11 +967,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::HipKernel< RAJA::statement::For<1, RAJA::hip_block_x_loop, // row RAJA::statement::For<0, RAJA::hip_thread_x_loop, // col - RAJA::statement::Lambda<0>, // dot = 0.0 + RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 RAJA::statement::For<2, RAJA::seq_exec, RAJA::statement::Lambda<1> // dot += ... >, - RAJA::statement::Lambda<2> // set C = ... + RAJA::statement::Lambda<2, + RAJA::Segs<0,1>, + RAJA::Params<0>> // set C = ... 
> > > @@ -983,7 +985,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::tuple{0.0}, // thread local variable for 'dot' // lambda 0 - [=] RAJA_DEVICE (int /* col */, int /* row */, int /* k */, double& dot) { + [=] RAJA_DEVICE (double& dot) { dot = 0.0; }, @@ -993,7 +995,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // lambda 2 - [=] RAJA_DEVICE (int col, int row, int /* k */, double& dot) { + [=] RAJA_DEVICE (int col, int row, double& dot) { d_Cview(row, col) = dot; } @@ -1003,61 +1005,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(Cview, N); //printResult(Cview, N); -//----------------------------------------------------------------------------// - - std::cout << "\n Running HIP mat-mult with multiple lambdas (RAJA-POL9a)...\n"; - - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); - - using EXEC_POL9a = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::hip_block_x_loop, - RAJA::statement::For<1, RAJA::hip_thread_y_loop, // row - RAJA::statement::For<0, RAJA::hip_thread_x_loop, // col - RAJA::statement::Lambda<0>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1> // dot += ... - >, - RAJA::statement::Lambda<2> // set C = ... - > - > - > - > - > - >; - - RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), - - RAJA::tuple{0.0}, // thread local variable for 'dot' - - // lambda 0 - [=] RAJA_DEVICE (int /* col */, int /* row */, int /* k */, double& dot) { - dot = 0.0; - }, - - // lambda 1 - [=] RAJA_DEVICE (int col, int row, int k, double& dot) { - dot += d_Aview(row, k) * d_Bview(k, col); - }, - - // lambda 2 - [=] RAJA_DEVICE (int col, int row, int /* k */, double& dot) { - d_Cview(row, col) = dot; - } - - ); - - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); - checkResult(Cview, N); -//printResult(Cview, N); //----------------------------------------------------------------------------// - std::cout << "\n Running HIP mat-mult with multiple lambdas - lambda args in statements (RAJA-POL9b)...\n"; + std::cout << "\n Running HIP mat-mult with multiple lambdas - lambda args in statements (RAJA-POL9)...\n"; std::memset(C, 0, N*N * sizeof(double)); hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); @@ -1065,8 +1016,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXEC_POL9b = RAJA::KernelPolicy< RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_y_loop, // row RAJA::statement::For<0, RAJA::hip_thread_x_loop, // col RAJA::statement::Lambda<0, Params<0>>, // dot = 0.0 diff --git a/examples/tut_matrix-transpose-local-array.cpp b/examples/tut_matrix-transpose-local-array.cpp index 610cc2c6a6..fbb1f508d6 100644 --- a/examples/tut_matrix-transpose-local-array.cpp +++ b/examples/tut_matrix-transpose-local-array.cpp @@ -202,8 +202,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _mattranspose_localarray_raja_start 
using SEQ_EXEC_POL_I = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::loop_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, RAJA::statement::InitLocalMem, @@ -248,15 +248,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); //Alias for convenience - using RAJA::statement::Segs; - using RAJA::statement::Offsets; - using RAJA::statement::Params; + using RAJA::Segs; + using RAJA::Offsets; + using RAJA::Params; // _mattranspose_localarray_raja_lambdaargs_start using SEQ_EXEC_POL_II = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::loop_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, RAJA::statement::InitLocalMem, @@ -309,8 +309,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::omp_parallel_for_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. The entries in the RAJA::ParamList @@ -376,8 +376,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::loop_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. The entries in the RAJA::ParamList @@ -445,8 +445,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. 
The entries in the RAJA::ParamList @@ -538,8 +538,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. The entries in the RAJA::ParamList diff --git a/examples/tut_sort.cpp b/examples/tut_sort.cpp new file mode 100644 index 0000000000..02171b56ec --- /dev/null +++ b/examples/tut_sort.cpp @@ -0,0 +1,634 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Sort Example + * + * Example shows how to perform RAJA unstable and stable sort operations + * for integer arrays, including pairs variant, using different comparators. + * Other array data types, comparators, etc. are similar + * + * RAJA features shown: + * - `RAJA::sort` and `RAJA::sort_pairs` methods + * - `RAJA::stable_sort` and `RAJA::stable_sort_pairs` methods + * - RAJA operators + * - Execution policies + * + * If CUDA is enabled, CUDA unified memory is used. 
+ */ + +/* + CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block +*/ +#if defined(RAJA_ENABLE_CUDA) +const int CUDA_BLOCK_SIZE = 16; +#endif + +#if defined(RAJA_ENABLE_HIP) +const int HIP_BLOCK_SIZE = 16; +#endif + +// +// Functions for checking results and printing vectors +// +template +void checkUnstableSortResult(const T* in, const T* out, int N); +template +void checkUnstableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N); +// +template +void checkStableSortResult(const T* in, const T* out, int N); +template +void checkStableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N); +// +template +void printArray(const T* k, int N); +template +void printArray(const T* k, const U* v, int N); + + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA sort example...\n"; + + // _sort_array_init_start +// +// Define array length +// + const int N = 20; + +// +// Allocate and initialize vector data +// + int* in = memoryManager::allocate(N); + int* out = memoryManager::allocate(N); + + unsigned* in_vals = memoryManager::allocate(N); + unsigned* out_vals = memoryManager::allocate(N); + + std::iota(in , in + N/2, 0); + std::iota(in + N/2, in + N , 0); + std::shuffle(in , in + N/2, std::mt19937{12345u}); + std::shuffle(in + N/2, in + N , std::mt19937{67890u}); + + std::fill(in_vals , in_vals + N/2, 0); + std::fill(in_vals + N/2, in_vals + N , 1); + + // _sort_array_init_end + + std::cout << "\n in keys...\n"; + printArray(in, N); + std::cout << "\n in (key, value) pairs...\n"; + printArray(in, in_vals, N); + std::cout << "\n"; + + +//----------------------------------------------------------------------------// +// Perform various sequential sorts to illustrate unstable/stable, +// pairs, default sorts with different comparators +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential sort (default)...\n"; + + std::copy_n(in, N, out); + + // _sort_seq_start + RAJA::sort(out, out + N); + // _sort_seq_end + + checkUnstableSortResult>(in, out, N); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + // _sort_seq_less_start + RAJA::sort(out, out + N, + RAJA::operators::less{}); + // _sort_seq_less_end + + checkUnstableSortResult>(in, out, N); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential stable_sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + // _sort_stable_seq_less_start + RAJA::stable_sort(out, out + N, + RAJA::operators::less{}); + // _sort_stable_seq_less_end + + checkStableSortResult>(in, out, N); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential stable_sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + // _sort_stable_seq_greater_start + RAJA::stable_sort(out, out + N, + RAJA::operators::greater{}); + // _sort_stable_seq_greater_end + + checkStableSortResult>(in, out, N); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential sort_pairs (non-decreasing)...\n"; + 
+ std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + // _sort_pairs_seq_less_start + RAJA::sort_pairs(out, out + N, out_vals, + RAJA::operators::less{}); + // _sort_pairs_seq_less_end + + checkUnstableSortResult>(in, out, in_vals, out_vals, N); + printArray(out, out_vals, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential stable_sort_pairs (non-increasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + // _sort_stable_pairs_seq_greater_start + RAJA::stable_sort_pairs(out, out + N, out_vals, + RAJA::operators::greater{}); + // _sort_stable_pairs_seq_greater_end + + checkStableSortResult>(in, out, in_vals, out_vals, N); + printArray(out, out_vals, N); + std::cout << "\n"; + + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// Perform a couple of OpenMP sorts... +//----------------------------------------------------------------------------// + + std::cout << "\n Running OpenMP sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + // _sort_omp_less_start + RAJA::sort(out, out + N, + RAJA::operators::less{}); + // _sort_omp_less_end + + checkUnstableSortResult>(in, out, N); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running OpenMP stable_sort_pairs (non-increasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + // _sort_stable_pairs_omp_greater_start + RAJA::stable_sort_pairs(out, out + N, out_vals, + RAJA::operators::greater{}); + // _sort_stable_pairs_omp_greater_end + + checkStableSortResult>(in, out, in_vals, out_vals, N); + printArray(out, out_vals, N); + std::cout << "\n"; + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + +//----------------------------------------------------------------------------// +// Perform a couple of CUDA sorts... +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA sort_pairs (non-increasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + // _sort_pairs_cuda_greater_start + RAJA::sort_pairs>(out, out + N, out_vals, + RAJA::operators::greater{}); + // _sort_pairs_cuda_greater_end + + checkUnstableSortResult>(in, out, in_vals, out_vals, N); + printArray(out, out_vals, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA stable_sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + // _sort_stable_cuda_less_start + RAJA::stable_sort>(out, out + N, + RAJA::operators::less{}); + // _sort_stable_cuda_less_end + + checkStableSortResult>(in, out, N); + printArray(out, N); + std::cout << "\n"; + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// +// Perform a couple of HIP sorts... 
+//----------------------------------------------------------------------------// + + std::cout << "\n Running HIP sort_pairs (non-decreasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + int* d_out = memoryManager::allocate_gpu(N); + int* d_out_vals = memoryManager::allocate_gpu(N); + + hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice )); + + RAJA::sort_pairs>(d_out, d_out + N, d_out_vals, + RAJA::operators::less{}); + + hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy( out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost )); + + checkUnstableSortResult>(in, out, in_vals, out_vals, N); + printArray(out, out_vals, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running HIP stable_sort (non-increasing)...\n"; + + std::copy_n(in, N, out); + + hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + + RAJA::stable_sort>(d_out, d_out + N, + RAJA::operators::greater{}); + + hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + + checkStableSortResult>(in, out, N); + printArray(out, N); + std::cout << "\n"; + + memoryManager::deallocate_gpu(d_out); + memoryManager::deallocate_gpu(d_out_vals); + +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. +// + memoryManager::deallocate(in); + memoryManager::deallocate(out); + + memoryManager::deallocate(in_vals); + memoryManager::deallocate(out_vals); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +template +bool equivalent(T const& a, T const& b, Comparator comp) +{ + return !comp(a, b) && !comp(b, a); +} + +// +// Function to check unstable sort result +// +template +void checkUnstableSortResult(const T* in, const T* out, int N) +{ + Comparator comp; + bool correct = true; + + // make map of keys to keys + using val_map = std::unordered_multiset; + std::unordered_map keys; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys.find(in[i]); + if (key_iter == keys.end()) { + auto ret = keys.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace(in[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i-1] << ", " << out[i] + << " out of order" + << " (at index " << i-1 << ")\n"; + } + // test there is an item with this + auto key_iter = keys.find(out[i]); + if (key_iter == keys.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " unknown or duplicate key" + << " (at index " << i << ")\n"; + } + auto val_iter = key_iter->second.find(out[i]); + if (val_iter == key_iter->second.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " unknown or duplicate val" + << " (at index " << i << ")\n"; + } + key_iter->second.erase(val_iter); + if (key_iter->second.size() == 0) { + keys.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} +/// +template +void checkUnstableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N) +{ + Comparator 
comp; + bool correct = true; + + // make map of keys to vals + using val_map = std::unordered_multiset; + std::unordered_map keys_to_vals; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys_to_vals.find(in[i]); + if (key_iter == keys_to_vals.end()) { + auto ret = keys_to_vals.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace(in_vals[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i-1] << "," << out_vals[i-1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" + << " out of order" + << " (at index " << i-1 << ")\n"; + } + // test there is a pair with this key and val + auto key_iter = keys_to_vals.find(out[i]); + if (key_iter == keys_to_vals.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " unknown or duplicate key" + << " (at index " << i << ")\n"; + } + auto val_iter = key_iter->second.find(out_vals[i]); + if (val_iter == key_iter->second.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " unknown or duplicate val" + << " (at index " << i << ")\n"; + } + key_iter->second.erase(val_iter); + if (key_iter->second.size() == 0) { + keys_to_vals.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} + +// +// Function to check stable sort result +// +template +void checkStableSortResult(const T* in, const T* out, int N) +{ + Comparator comp; + bool correct = true; + + // make map of keys to keys + using val_map = std::list; + std::unordered_map keys; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys.find(in[i]); + if (key_iter == keys.end()) { + auto ret = keys.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace_back(in[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i-1] << ", " << out[i] + << " out of order " + << " (at index " << i-1 << ")\n"; + } + // test there is an item with this + auto key_iter = keys.find(out[i]); + if (key_iter == keys.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " unknown or duplicate key " + << " (at index " << i << ")\n"; + } + if (key_iter->second.front() != out[i]) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " out of stable order or unknown val " + << " (at index " << i << ")\n"; + } + key_iter->second.pop_front(); + if (key_iter->second.size() == 0) { + keys.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} +/// +template +void checkStableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N) +{ + Comparator comp; + bool correct = true; + + // make map of keys to vals + using val_map = std::list; + std::unordered_map keys_to_vals; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys_to_vals.find(in[i]); + if (key_iter == keys_to_vals.end()) { + auto ret = 
keys_to_vals.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace_back(in_vals[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i-1] << "," << out_vals[i-1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" + << " out of order " + << " (at index " << i-1 << ")\n"; + } + // test there is a pair with this key and val + auto key_iter = keys_to_vals.find(out[i]); + if (key_iter == keys_to_vals.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " unknown or duplicate key " + << " (at index " << i << ")\n"; + } + if (key_iter->second.front() != out_vals[i]) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " out of stable order or unknown val " + << " (at index " << i << ")\n"; + } + key_iter->second.pop_front(); + if (key_iter->second.size() == 0) { + keys_to_vals.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} + + +// +// Function to print vector. +// +template +void printArray(const T* k, int N) +{ + std::cout << std::endl; + for (int i = 0; i < N; ++i) { std::cout << " " << k[i]; } + std::cout << std::endl; +} +/// +template +void printArray(const T* k, const U* v, int N) +{ + std::cout << std::endl; + for (int i = 0; i < N; ++i) { std::cout << " (" << k[i] << "," << v[i] << ")"; } + std::cout << std::endl; +} + diff --git a/examples/tut_tiled-matrix-transpose.cpp b/examples/tut_tiled-matrix-transpose.cpp index 0ad0ebb85d..44c8fbc5b7 100644 --- a/examples/tut_tiled-matrix-transpose.cpp +++ b/examples/tut_tiled-matrix-transpose.cpp @@ -161,8 +161,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_tiled_mattranspose_start using KERNEL_EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> @@ -193,8 +193,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using KERNEL_EXEC_POL_OMP = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::For<1, RAJA::omp_parallel_for_exec, RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::Lambda<0> @@ -227,8 +227,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Collapse, RAJA::statement::Lambda<0> @@ -260,8 +260,8 @@ int main(int RAJA_UNUSED_ARG(argc), char 
**RAJA_UNUSED_ARG(argv[])) using KERNEL_EXEC_POL_CUDA = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_x_direct, RAJA::statement::For<0, RAJA::cuda_thread_y_direct, RAJA::statement::Lambda<0> @@ -302,8 +302,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using KERNEL_EXEC_POL_HIP = RAJA::KernelPolicy< RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_x_direct, RAJA::statement::For<0, RAJA::hip_thread_y_direct, RAJA::statement::Lambda<0> diff --git a/examples/tut_vertexsum-coloring.cpp b/examples/tut_vertexsum-coloring.cpp index 3f4de68771..1612b2af0b 100644 --- a/examples/tut_vertexsum-coloring.cpp +++ b/examples/tut_vertexsum-coloring.cpp @@ -15,6 +15,8 @@ #include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" + /* * Mesh Vertex Sum with Index Coloring Example * @@ -232,10 +234,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::TypedIndexSet colorset; - colorset.push_back( SegmentType(&idx0[0], idx0.size()) ); - colorset.push_back( SegmentType(&idx1[0], idx1.size()) ); - colorset.push_back( SegmentType(&idx2[0], idx2.size()) ); - colorset.push_back( SegmentType(&idx3[0], idx3.size()) ); + camp::resources::Resource host_res{camp::resources::Host()}; + + colorset.push_back( SegmentType(&idx0[0], idx0.size(), host_res) ); + colorset.push_back( SegmentType(&idx1[0], idx1.size(), host_res) ); + colorset.push_back( SegmentType(&idx2[0], idx2.size(), host_res) ); + colorset.push_back( SegmentType(&idx3[0], idx3.size(), host_res) ); // _colorindexset_vertexsum_end //----------------------------------------------------------------------------// @@ -307,13 +311,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // std::cout << "\n Running RAJA CUDA index set version...\n"; +// +// We create a RAJA TypedIndexSet with four ListSegments as before, +// but now we use a CUDA resource so the segment indices live in +// device memory. 
+// + RAJA::TypedIndexSet colorset_cuda; + + camp::resources::Resource cuda_res{camp::resources::Cuda()}; + + colorset_cuda.push_back( SegmentType(&idx0[0], idx0.size(), cuda_res) ); + colorset_cuda.push_back( SegmentType(&idx1[0], idx1.size(), cuda_res) ); + colorset_cuda.push_back( SegmentType(&idx2[0], idx2.size(), cuda_res) ); + colorset_cuda.push_back( SegmentType(&idx3[0], idx3.size(), cuda_res) ); + std::memset(vertexvol, 0, N_vert*N_vert * sizeof(double)); // _raja_cuda_colorindexset_vertexsum_start using EXEC_POL4 = RAJA::ExecPolicy>; - RAJA::forall(colorset, [=] RAJA_DEVICE (int ie) { + RAJA::forall(colorset_cuda, [=] RAJA_DEVICE (int ie) { int* iv = &(elem2vert_map[4*ie]); vertexvol[ iv[0] ] += elemvol[ie] / 4.0 ; vertexvol[ iv[1] ] += elemvol[ie] / 4.0 ; @@ -347,10 +365,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(vertexvol, 0, N_vert*N_vert * sizeof(double)); hipMemcpy(d_vertexvol, vertexvol, N_vert*N_vert*sizeof(double), hipMemcpyHostToDevice); +// +// We create a RAJA TypedIndexSet with four ListSegments as before, +// but now we use a Hip resource so the segment indices live in +// device memory. +// + RAJA::TypedIndexSet colorset_hip; + + camp::resources::Resource hip_res{camp::resources::Hip()}; + + colorset_hip.push_back( SegmentType(&idx0[0], idx0.size(), hip_res) ); + colorset_hip.push_back( SegmentType(&idx1[0], idx1.size(), hip_res) ); + colorset_hip.push_back( SegmentType(&idx2[0], idx2.size(), hip_res) ); + colorset_hip.push_back( SegmentType(&idx3[0], idx3.size(), hip_res) ); + using EXEC_POL4 = RAJA::ExecPolicy>; - RAJA::forall(colorset, [=] RAJA_DEVICE (int ie) { + RAJA::forall(colorset_hip, [=] RAJA_DEVICE (int ie) { int* iv = &(d_elem2vert_map[4*ie]); d_vertexvol[ iv[0] ] += d_elemvol[ie] / 4.0 ; d_vertexvol[ iv[1] ] += d_elemvol[ie] / 4.0 ; diff --git a/examples/wave-eqn.cpp b/examples/wave-eqn.cpp index dcc651d048..f1345335c5 100644 --- a/examples/wave-eqn.cpp +++ b/examples/wave-eqn.cpp @@ -128,15 +128,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; // OpenMP policy - // using fdPolicy = RAJA::KernelPolicy< - // RAJA::statement::For<0, RAJA::omp_parallel_for_exec >, - // RAJA::statement::For<1, RAJA::seq_exec > >; + //using fdPolicy = RAJA::KernelPolicy< + //RAJA::statement::For<1, RAJA::omp_parallel_for_exec, + // RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::Lambda<0> > > >; // CUDA policy - // using fdPolicy = RAJA::KernelPolicy< - // RAJA::statement::CudaCollapse< - // RAJA::statement::For<0, RAJA::cuda_threadblock_x_exec<16> >, - // RAJA::statement::For<1, RAJA::cuda_threadblock_y_exec<16> > > >; + //using fdPolicy = + //RAJA::KernelPolicy< + // RAJA::statement::CudaKernel< + // RAJA::statement::Tile<1, RAJA::tile_fixed<16>, RAJA::cuda_block_y_direct, + // RAJA::statement::Tile<0, RAJA::tile_fixed<16>, RAJA::cuda_block_x_direct, + // RAJA::statement::For<1, RAJA::cuda_thread_y_direct, + // RAJA::statement::For<0, RAJA::cuda_thread_x_direct, + // RAJA::statement::Lambda<0> + // > + // > + // > + // > + // > + // >; time = 0; @@ -182,8 +192,8 @@ void computeErr(double *P, double tf, grid_s grid) RAJA::ReduceMax tMax(-1.0); using initialPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::loop_exec >, - RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::Lambda<0>> >; + RAJA::statement::For<1, RAJA::loop_exec , + RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::Lambda<0> > > >; 
RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), [=] (RAJA::Index_type tx, RAJA::Index_type ty) { @@ -213,8 +223,8 @@ void setIC(double *P1, double *P2, double t0, double t1, grid_s grid) RAJA::RangeSegment fdBounds(0, grid.nx); using initialPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::loop_exec >, - RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::Lambda<0>> >; + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::Lambda<0>> > >; RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), [=] (RAJA::Index_type tx, RAJA::Index_type ty) { diff --git a/exercises/tutorial_halfday/ex3_colored-indexset.cpp b/exercises/tutorial_halfday/ex3_colored-indexset.cpp index 0f3fdcbf85..0d370fcfe9 100644 --- a/exercises/tutorial_halfday/ex3_colored-indexset.cpp +++ b/exercises/tutorial_halfday/ex3_colored-indexset.cpp @@ -13,6 +13,8 @@ #include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" + #include "memoryManager.hpp" /* @@ -213,23 +215,33 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif - -// -// Create a RAJA TypedIndexSet with four ListSegments, one for the indices of -// the elements in each subset. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. -// + // The TypedIndexSet is a variadic template, where the template arguments -// are the segment types that the TypedIndexSet can hold. -// + +// are the segment types that the TypedIndexSet can hold. +// + using SegmentType = RAJA::TypedListSegment; +#if defined(RAJA_ENABLE_OPENMP) + +// +// Resource object used to construct list segment objects with indices +// living in host (CPU) memory. +// + camp::resources::Resource host_res{camp::resources::Host()}; + +// +// Create a RAJA TypedIndexSet with four ListSegments, one for the indices of +// the elements in each subset. This will be used in the RAJA OpenMP and CUDA +// variants of the vertex sum calculation. + + RAJA::TypedIndexSet colorset; /// /// TODO... /// - /// EXERCISE: Create a RAJA::TypedIndexSet object that holds four - /// RAJA::TypedListSegment objects, one for each of the - /// 'idx' arrays above. + /// EXERCISE: Add four SegmentType objects to the colorset, one for each of + /// the 'idx' arrays above. Remember to pass the 'host_res' + /// object to the SegmentType constructor. /// @@ -238,8 +250,6 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // over segments, OpenMP parallel iteration of each segment) //----------------------------------------------------------------------------// -#if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running RAJA OpenMP index set vertex sum...\n"; std::memset(areav, 0, Nvert*Nvert * sizeof(double)); @@ -272,6 +282,36 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) +// +// Resource object used to construct list segment objects with indices +// living in CUDA device (GPU) memory. +// + camp::resources::Resource cuda_res{camp::resources::Cuda()}; + + RAJA::TypedIndexSet cuda_colorset; + + /// + /// TODO... + /// + /// EXERCISE: Add four SegmentType objects to the cuda_colorset, one for + /// each of the 'idx' arrays above. Remember to pass the 'cuda_res' + /// object to the SegmentType constructor. + /// + + + /// + /// TODO... 
+ /// + /// EXERCISE: Implement the vertex sum kernel a RAJA::forall + /// method with execution policy type + /// + /// RAJA::ExecPolicy> + /// + /// so that the kernel iterates over the segments sequentially + /// and executes each segment in parallel as a CUDA kernel. + + std::cout << "\n Running RAJA CUDA index set vertex sum...\n"; std::memset(areav, 0, Nvert*Nvert * sizeof(double)); diff --git a/exercises/tutorial_halfday/ex3_colored-indexset_solution.cpp b/exercises/tutorial_halfday/ex3_colored-indexset_solution.cpp index ffb476d521..1e3d364b58 100644 --- a/exercises/tutorial_halfday/ex3_colored-indexset_solution.cpp +++ b/exercises/tutorial_halfday/ex3_colored-indexset_solution.cpp @@ -13,6 +13,8 @@ #include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" + #include "memoryManager.hpp" /* @@ -210,31 +212,37 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif + +// The TypedIndexSet is a variadic template, where the template arguments +// are the segment types that the TypedIndexSet can hold. +// + using SegmentType = RAJA::TypedListSegment; +#if defined(RAJA_ENABLE_OPENMP) + +// +// Resource object used to construct list segment objects with indices +// living in host (CPU) memory. +// + camp::resources::Resource host_res{camp::resources::Host()}; + // // Create a RAJA TypedIndexSet with four ListSegments, one for the indices of // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA // variants of the vertex sum calculation. -// -// The TypedIndexSet is a variadic template, where the template arguments -// are the segment types that the TypedIndexSet can hold. -// - using SegmentType = RAJA::TypedListSegment; RAJA::TypedIndexSet colorset; - colorset.push_back( SegmentType(&idx[0][0], idx[0].size()) ); - colorset.push_back( SegmentType(&idx[1][0], idx[1].size()) ); - colorset.push_back( SegmentType(&idx[2][0], idx[2].size()) ); - colorset.push_back( SegmentType(&idx[3][0], idx[3].size()) ); + colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), host_res) ); + colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), host_res) ); + colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), host_res) ); + colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), host_res) ); //----------------------------------------------------------------------------// // RAJA OpenMP vertex sum calculation using TypedIndexSet (sequential iteration // over segments, OpenMP parallel iteration of each segment) //----------------------------------------------------------------------------// -#if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running RAJA OpenMP index set vertex sum...\n"; std::memset(areav, 0, Nvert*Nvert * sizeof(double)); @@ -264,6 +272,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) +// +// Resource object used to construct list segment objects with indices +// living in host (CPU) memory. +// + camp::resources::Resource cuda_res{camp::resources::Cuda()}; + +// +// Create a RAJA TypedIndexSet with four ListSegments, one for the indices of +// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA +// variants of the vertex sum calculation. 
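The CUDA variant the exercise above asks for follows the same pattern introduced earlier in this changeset: list segments whose indices are copied into device memory through a Cuda resource, iterated one segment (color) at a time, with each segment launched as a CUDA kernel. A compact sketch under stated assumptions (array names, the 256-thread block size, and CUDA-accessible data pointers are illustrative; it needs a CUDA-enabled RAJA build):

    #include "RAJA/RAJA.hpp"
    #include "camp/resource.hpp"
    #include <vector>

    using SegmentType = RAJA::TypedListSegment<int>;

    // areav, areae and e2v_map are assumed to point to CUDA-accessible
    // (e.g. unified) memory, as in the exercise code.
    void vertex_sum_cuda(const std::vector<int>& idx0,
                         const std::vector<int>& idx1,
                         const std::vector<int>& idx2,
                         const std::vector<int>& idx3,
                         double* areav, const double* areae,
                         const int* e2v_map)
    {
      // Segment index data is copied to the GPU through this resource.
      camp::resources::Resource cuda_res{camp::resources::Cuda()};

      RAJA::TypedIndexSet<SegmentType> cuda_colorset;
      cuda_colorset.push_back( SegmentType(&idx0[0], idx0.size(), cuda_res) );
      cuda_colorset.push_back( SegmentType(&idx1[0], idx1.size(), cuda_res) );
      cuda_colorset.push_back( SegmentType(&idx2[0], idx2.size(), cuda_res) );
      cuda_colorset.push_back( SegmentType(&idx3[0], idx3.size(), cuda_res) );

      // Segments (colors) run one after another; each segment is launched
      // as a CUDA kernel with 256-thread blocks.
      using EXEC_POL = RAJA::ExecPolicy<RAJA::seq_segit, RAJA::cuda_exec<256>>;

      RAJA::forall<EXEC_POL>(cuda_colorset, [=] RAJA_DEVICE (int ie) {
        const int* iv = &(e2v_map[4*ie]);
        areav[ iv[0] ] += areae[ie] / 4.0;
        areav[ iv[1] ] += areae[ie] / 4.0;
        areav[ iv[2] ] += areae[ie] / 4.0;
        areav[ iv[3] ] += areae[ie] / 4.0;
      });
    }

Swapping RAJA::cuda_exec for RAJA::omp_parallel_for_exec, and a Host resource for the segments, gives the OpenMP variant shown earlier in these files.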
+ + RAJA::TypedIndexSet cuda_colorset; + + cuda_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), cuda_res) ); + cuda_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), cuda_res) ); + cuda_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), cuda_res) ); + cuda_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), cuda_res) ); + std::cout << "\n Running RAJA CUDA index set vertex sum...\n"; std::memset(areav, 0, Nvert*Nvert * sizeof(double)); @@ -271,7 +297,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXEC_POL4 = RAJA::ExecPolicy>; - RAJA::forall(colorset, [=] RAJA_DEVICE (int ie) { + RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { int* iv = &(e2v_map[4*ie]); areav[ iv[0] ] += areae[ie] / 4.0 ; areav[ iv[1] ] += areae[ie] / 4.0 ; diff --git a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp index 51edcbaa5f..f3026c9f61 100644 --- a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp +++ b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp @@ -166,9 +166,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if 0 using KERNEL_EXEC_POL_SEQ = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, // Fill in inner loop execution statements.... RAJA::statement::Lambda<0> @@ -205,9 +205,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if 0 using KERNEL_EXEC_POL_OMP = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, // Fill in inner loop execution statements.... RAJA::statement::Lambda<0> @@ -246,9 +246,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Collapse, @@ -280,9 +280,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using KERNEL_EXEC_POL_CUDA = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, // Fill in inner loop execution statements.... 
RAJA::statement::Lambda<0> diff --git a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp index f26ea27775..c6495ee98a 100644 --- a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp +++ b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp @@ -164,9 +164,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using KERNEL_EXEC_POL_SEQ = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, @@ -198,9 +198,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using KERNEL_EXEC_POL_OMP = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::For<1, RAJA::omp_parallel_for_exec, RAJA::statement::For<0, RAJA::seq_exec, @@ -236,9 +236,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Collapse, @@ -269,9 +269,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using KERNEL_EXEC_POL_CUDA = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, RAJA::statement::For<0, RAJA::cuda_thread_x_direct, diff --git a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp index 884c207bcd..1b9f5ccd1a 100644 --- a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp +++ b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp @@ -117,8 +117,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int brow = 0; brow < outer_Dimr; ++brow) { + for (int bcol = 0; bcol < outer_Dimc; ++bcol) { // Stack-allocated local array for data on a tile int Tile[TILE_SZ][TILE_SZ]; @@ -132,8 +132,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int trow = 0; trow < TILE_SZ; ++trow) { for (int tcol = 0; tcol < TILE_SZ; ++tcol) { - int col = bx * TILE_SZ + tcol; // Matrix column index - int row = by * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check if (row < N_r && col < N_c) { @@ -151,8 +151,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int tcol = 0; tcol < TILE_SZ; ++tcol) { for (int trow = 0; trow < TILE_SZ; ++trow) { - int col = bx * 
TILE_SZ + tcol; // Matrix column index - int row = by * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check if (row < N_r && col < N_c) { @@ -209,17 +209,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::InitLocalMem, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, RAJA::statement::Lambda<0> > >, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, RAJA::statement::Lambda<1> > @@ -247,7 +247,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - [=](int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { RAJA_Tile(trow, tcol) = Aview(row, col); @@ -276,17 +276,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::InitLocalMem, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, RAJA::statement::Lambda<0> > >, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, RAJA::statement::Lambda<1> > @@ -315,7 +315,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Fill in lambda expression to write input matrix entry // to local tile array. 
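The Param swaps above make each ForICount statement store its tile-local index in the parameter-tuple slot that matches the lambda argument order (col, row, tcol, trow, tile). A sequential sketch of the corrected pairing, with the InitLocalMem and LocalArray details filled in as assumptions (TILE_DIM, the views, and int data are illustrative):

    #include "RAJA/RAJA.hpp"

    constexpr int TILE_DIM = 16;

    // Tile-local scratch array with compile-time dimensions.
    using TILE_MEM = RAJA::LocalArray<int, RAJA::Perm<0, 1>,
                                      RAJA::SizeList<TILE_DIM, TILE_DIM>>;

    // ForICount<1,...> writes its tile-local index into Param<1> and
    // ForICount<0,...> into Param<0>, so lambda arguments line up with the
    // parameter tuple.
    using SEQ_EXEC_POL = RAJA::KernelPolicy<
      RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::loop_exec,
        RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::loop_exec,
          RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem, RAJA::ParamList<2>,
            // Read phase (Lambda<0>): copy matrix entries into the tile.
            RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec,
              RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec,
                RAJA::statement::Lambda<0>
              >
            >,
            // Write phase (Lambda<1>): drain the tile in transposed order.
            RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec,
              RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec,
                RAJA::statement::Lambda<1>
              >
            >
          >
        >
      >
    >;

    void transpose_local_array(int* A, int* At, int N_r, int N_c)
    {
      RAJA::View<int, RAJA::Layout<2>> Aview(A, N_r, N_c);
      RAJA::View<int, RAJA::Layout<2>> Atview(At, N_c, N_r);

      TILE_MEM RAJA_Tile;

      RAJA::kernel_param<SEQ_EXEC_POL>(
        RAJA::make_tuple(RAJA::RangeSegment(0, N_c),
                         RAJA::RangeSegment(0, N_r)),
        RAJA::make_tuple((int)0, (int)0, RAJA_Tile),

        [=](int col, int row, int tcol, int trow, TILE_MEM& Tile) {
          Tile(trow, tcol) = Aview(row, col);
        },

        [=](int col, int row, int tcol, int trow, TILE_MEM Tile) {
          Atview(col, row) = Tile(trow, tcol);
        });
    }

The write-phase lambda may take the tile by value, as these files now do, since the LocalArray object is a lightweight wrapper around the tile memory.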
- [=](int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { Atview(col, row) = RAJA_Tile(trow, tcol); @@ -345,9 +345,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::InitLocalMem, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0> > @@ -355,9 +355,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::CudaSyncThreads, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<1> > @@ -386,7 +386,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - [=] RAJA_DEVICE(int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=] RAJA_DEVICE(int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { RAJA_Tile(trow, tcol) = Aview(row, col); diff --git a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp index fac8506606..2f2c1733d7 100644 --- a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp +++ b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp @@ -117,8 +117,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int brow = 0; brow < outer_Dimr; ++brow) { + for (int bcol = 0; bcol < outer_Dimc; ++bcol) { // Stack-allocated local array for data on a tile int Tile[TILE_SZ][TILE_SZ]; @@ -132,8 +132,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int trow = 0; trow < TILE_SZ; ++trow) { for (int tcol = 0; tcol < TILE_SZ; ++tcol) { - int col = bx * TILE_SZ + tcol; // Matrix column index - int row = by * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check if (row < N_r && col < N_c) { @@ -151,8 +151,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int tcol = 0; tcol < TILE_SZ; ++tcol) { for (int trow = 0; trow < TILE_SZ; ++trow) { - int col = bx * TILE_SZ + tcol; // Matrix column index - int row = by * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check if (row < N_r && col < N_c) { @@ -203,24 +203,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using SEQ_EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, RAJA::statement::InitLocalMem, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, 
RAJA::loop_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, RAJA::statement::Lambda<0> > >, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, RAJA::statement::Lambda<1> > @@ -235,13 +235,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - [=](int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { RAJA_Tile(trow, tcol) = Aview(row, col); }, - [=](int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { Atview(col, row) = RAJA_Tile(trow, tcol); @@ -259,24 +259,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using OPENMP_EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, RAJA::statement::InitLocalMem, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, RAJA::statement::Lambda<0> > >, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, RAJA::statement::Lambda<1> > @@ -290,13 +290,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - [=](int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { RAJA_Tile(trow, tcol) = Aview(row, col); }, - [=](int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { Atview(col, row) = RAJA_Tile(trow, tcol); @@ -317,16 +317,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using CUDA_EXEC_POL = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, RAJA::statement::InitLocalMem, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0> > @@ -334,9 +334,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::CudaSyncThreads, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<1, 
RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<1> > @@ -354,13 +354,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - [=] RAJA_DEVICE (int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=] RAJA_DEVICE (int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { RAJA_Tile(trow, tcol) = Aview(row, col); }, - [=] RAJA_DEVICE(int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=] RAJA_DEVICE(int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { Atview(col, row) = RAJA_Tile(trow, tcol); diff --git a/host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@8.0.1-cuda.cmake b/host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@8.0.1-cuda.cmake new file mode 100644 index 0000000000..dcb5a731d4 --- /dev/null +++ b/host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@8.0.1-cuda.cmake @@ -0,0 +1,58 @@ +################### +# Generated host-config - Edit at own risk! +################### +# Copyright (c) 2020, Lawrence Livermore National Security, LLC and +# other Umpire Project Developers. See the top-level LICENSE file for +# details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +################### + +#------------------ +# SYS_TYPE: blueos_3_ppc64le_ib_p9 +# Compiler Spec: clang@8.0.1 +# CMake executable path: /usr/tce/packages/cmake/cmake-3.14.5/bin/cmake +#------------------ + +#------------------ +# Compilers +#------------------ + +set(CMAKE_C_COMPILER "/usr/tce/packages/clang/clang-8.0.1/bin/clang" CACHE PATH "") + +set(CMAKE_CXX_COMPILER "/usr/tce/packages/clang/clang-8.0.1/bin/clang++" CACHE PATH "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +#------------------------------------------------------------------------------ +# Cuda +#------------------------------------------------------------------------------ + +set(ENABLE_CUDA ON CACHE BOOL "") + +set(CUDA_TOOLKIT_ROOT_DIR "/usr/tce/packages/cuda/cuda-10.1.243" CACHE PATH "") + +set(CMAKE_CUDA_COMPILER "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" CACHE PATH "") + +set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -Xcompiler -O3 -Xcompiler -fopenmp" CACHE STRING "") + +set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O3 -g -lineinfo -Xcompiler -O3 -Xcompiler -fopenmp" CACHE STRING "") + +set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g -G" CACHE STRING "") + +#------------------------------------------------------------------------------ +# Other +#------------------------------------------------------------------------------ + +set(RAJA_RANGE_ALIGN "4" CACHE STRING "") + +set(RAJA_RANGE_MIN_LENGTH "32" CACHE STRING "") + +set(RAJA_DATA_ALIGN "64" CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED ON CACHE BOOL "") + diff --git a/host-configs/lc-builds/bgqos/clang_4_0_0.cmake b/host-configs/lc-builds/bgqos/clang_4_0_0.cmake index 81a6110098..01a5e3bdf0 100644 --- a/host-configs/lc-builds/bgqos/clang_4_0_0.cmake +++ b/host-configs/lc-builds/bgqos/clang_4_0_0.cmake @@ -19,8 +19,6 @@ set(MPIEXEC_NUMPROC_FLAG "-n" CACHE PATH "") set(ENABLE_WRAP_ALL_TESTS_WITH_MPIEXEC TRUE CACHE BOOL "Ensures that tests will be wrapped with srun to run on the backend nodes") -set(RAJA_RANGE_ALIGN 4 CACHE INT "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "") set(RAJA_DATA_ALIGN 64 CACHE INT "") set(RAJA_HOST_CONFIG_LOADED On CACHE Bool "") diff --git a/host-configs/lc-builds/blueos/clang_X.cmake 
b/host-configs/lc-builds/blueos/clang_X.cmake index f04bd69264..50e7dbac28 100755 --- a/host-configs/lc-builds/blueos/clang_X.cmake +++ b/host-configs/lc-builds/blueos/clang_X.cmake @@ -11,8 +11,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/clangcuda_upstream_2018_12_03_nvcc_9_2.cmake b/host-configs/lc-builds/blueos/clangcuda_upstream_2018_12_03_nvcc_9_2.cmake index 5ce2ccbb4f..d3f36540f2 100644 --- a/host-configs/lc-builds/blueos/clangcuda_upstream_2018_12_03_nvcc_9_2.cmake +++ b/host-configs/lc-builds/blueos/clangcuda_upstream_2018_12_03_nvcc_9_2.cmake @@ -16,8 +16,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/gcc_X.cmake b/host-configs/lc-builds/blueos/gcc_X.cmake index da4d104c04..81c7bcc411 100755 --- a/host-configs/lc-builds/blueos/gcc_X.cmake +++ b/host-configs/lc-builds/blueos/gcc_X.cmake @@ -11,8 +11,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -finline-functions -finline-limit=20000" CAC set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -finline-functions -finline-limit=20000" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/nvcc_clang_X.cmake b/host-configs/lc-builds/blueos/nvcc_clang_X.cmake index e6e5d24182..9c356e1e83 100755 --- a/host-configs/lc-builds/blueos/nvcc_clang_X.cmake +++ b/host-configs/lc-builds/blueos/nvcc_clang_X.cmake @@ -17,8 +17,6 @@ set(CMAKE_CUDA_FLAGS_RELEASE "-O3 ${HOST_OPT_FLAGS}" CACHE STRING "") set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo -O3 ${HOST_OPT_FLAGS}" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake b/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake index bc99e7ce4c..c2e5948640 100755 --- a/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake +++ b/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake @@ -17,8 +17,6 @@ set(CMAKE_CUDA_FLAGS_RELEASE "-O3 ${HOST_OPT_FLAGS}" CACHE STRING "") set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo -O3 ${HOST_OPT_FLAGS}" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/nvcc_xl_2019_X.cmake b/host-configs/lc-builds/blueos/nvcc_xl_X.cmake similarity index 93% rename from host-configs/lc-builds/blueos/nvcc_xl_2019_X.cmake rename to host-configs/lc-builds/blueos/nvcc_xl_X.cmake index 2eebc6091e..8b6662a862 100755 --- 
a/host-configs/lc-builds/blueos/nvcc_xl_2019_X.cmake +++ b/host-configs/lc-builds/blueos/nvcc_xl_X.cmake @@ -23,8 +23,6 @@ set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo -O3 ${HOST_OPT_FLAGS}" CACHE S # - 1500-036 nostrict optimizations may alter code semantics # (can be countered with -qstrict, with less optimization) -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/pgi_X.cmake b/host-configs/lc-builds/blueos/pgi_X.cmake index 253135fa71..f746940489 100755 --- a/host-configs/lc-builds/blueos/pgi_X.cmake +++ b/host-configs/lc-builds/blueos/pgi_X.cmake @@ -11,8 +11,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fast -mp" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-fast -g -mp" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -mp" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/xl_2019_X.cmake b/host-configs/lc-builds/blueos/xl_2020_X.cmake similarity index 92% rename from host-configs/lc-builds/blueos/xl_2019_X.cmake rename to host-configs/lc-builds/blueos/xl_2020_X.cmake index 4973bbb431..c04f835145 100755 --- a/host-configs/lc-builds/blueos/xl_2019_X.cmake +++ b/host-configs/lc-builds/blueos/xl_2020_X.cmake @@ -17,8 +17,6 @@ set(CMAKE_EXE_LINKER_FLAGS "-Wl,-z,muldefs" CACHE STRING "") # - 1500-036 nostrict optimizations may alter code semantics # (can be countered with -qstrict, with less optimization) -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/xl_X.cmake b/host-configs/lc-builds/blueos/xl_X.cmake new file mode 100755 index 0000000000..c04f835145 --- /dev/null +++ b/host-configs/lc-builds/blueos/xl_X.cmake @@ -0,0 +1,23 @@ +############################################################################### +# Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_XLC" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -qsmp=omp:noopt " CACHE STRING "") +set(CMAKE_EXE_LINKER_FLAGS "-Wl,-z,muldefs" CACHE STRING "") + +# Suppressed XLC warnings: +# - 1500-029 cannot inline +# - 1500-036 nostrict optimizations may alter code semantics +# (can be countered with -qstrict, with less optimization) + +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") + diff --git a/host-configs/lc-builds/toss3/clang_X.cmake b/host-configs/lc-builds/toss3/clang_X.cmake index beedc17ea6..506bce066d 100755 --- a/host-configs/lc-builds/toss3/clang_X.cmake +++ b/host-configs/lc-builds/toss3/clang_X.cmake @@ -11,8 +11,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-O3 -msse4.2 -funroll-loops -finline-functions" CAC set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -msse4.2 -funroll-loops -finline-functions" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/toss3/clangcuda_6_0_0_nvcc_8_0.cmake b/host-configs/lc-builds/toss3/clangcuda_6_0_0_nvcc_8_0.cmake index b769677d16..56f0ba9320 100644 --- a/host-configs/lc-builds/toss3/clangcuda_6_0_0_nvcc_8_0.cmake +++ b/host-configs/lc-builds/toss3/clangcuda_6_0_0_nvcc_8_0.cmake @@ -16,8 +16,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-O3 -msse4.2 -funroll-loops -finline-functions" CAC set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -msse4.2 -funroll-loops -finline-functions" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE INT "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/toss3/gcc_X.cmake b/host-configs/lc-builds/toss3/gcc_X.cmake index da4d104c04..81c7bcc411 100755 --- a/host-configs/lc-builds/toss3/gcc_X.cmake +++ b/host-configs/lc-builds/toss3/gcc_X.cmake @@ -11,8 +11,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -finline-functions -finline-limit=20000" CAC set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -finline-functions -finline-limit=20000" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/toss3/hip.cmake b/host-configs/lc-builds/toss3/hip.cmake new file mode 100644 index 0000000000..e0de15ac9b --- /dev/null +++ b/host-configs/lc-builds/toss3/hip.cmake @@ -0,0 +1,28 @@ +############################################################################### +# Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O2" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(HIP_COMMON_OPT_FLAGS ) +set(HIP_COMMON_DEBUG_FLAGS) +set(HOST_OPT_FLAGS) + +if(CMAKE_BUILD_TYPE MATCHES Release) + set(RAJA_HIPCC_FLAGS "-fPIC -O2 ${HIP_COMMON_OPT_FLAGS} ${HOST_OPT_FLAGS}" CACHE STRING "") +elseif(CMAKE_BUILD_TYPE MATCHES RelWithDebInfo) + set(RAJA_HIPCC_FLAGS "-fPIC -g -O2 ${HIP_COMMON_OPT_FLAGS} ${HOST_OPT_FLAGS}" CACHE STRING "") +elseif(CMAKE_BUILD_TYPE MATCHES Debug) + set(RAJA_HIPCC_FLAGS "-fPIC -g -O0 ${HIP_COMMON_DEBUG_FLAGS}" CACHE STRING "") +endif() + +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/toss3/icpc_X_gcc7headers.cmake b/host-configs/lc-builds/toss3/icpc_X_gcc7headers.cmake index 877d4f3189..3e7e3a7675 100755 --- a/host-configs/lc-builds/toss3/icpc_X_gcc7headers.cmake +++ b/host-configs/lc-builds/toss3/icpc_X_gcc7headers.cmake @@ -13,8 +13,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3 -fp-model source -unroll-aggres set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g -fp-model source -unroll-aggressive -finline-functions -axCORE-AVX2 -diag-disable cpu-dispatch" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake b/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake index 46b1ac878f..ae34c4e6da 100755 --- a/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake +++ b/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake @@ -13,8 +13,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3 -march=native -ansi-alias -axCO set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g -march=native -ansi-alias -axCORE-AVX2 -diag-disable cpu-dispatch" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/toss3/pgi_X.cmake b/host-configs/lc-builds/toss3/pgi_X.cmake index 7a8f29d98f..a54a39c0e8 100755 --- a/host-configs/lc-builds/toss3/pgi_X.cmake +++ b/host-configs/lc-builds/toss3/pgi_X.cmake @@ -11,8 +11,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fast -mp" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -fast -mp" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -mp" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/macos-builds/clang_X.cmake b/host-configs/macos-builds/clang_X.cmake new file mode 100755 index 0000000000..f10479f54d --- /dev/null +++ b/host-configs/macos-builds/clang_X.cmake @@ -0,0 +1,18 @@ +############################################################################### +# Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -funroll-loops -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -march=native -funroll-loops -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(RAJA_RANGE_ALIGN 4 CACHE STRING "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/quartz-toss_3_x86_64_ib-clang@9.0.0.cmake b/host-configs/quartz-toss_3_x86_64_ib-clang@9.0.0.cmake new file mode 100644 index 0000000000..3af54ba3b8 --- /dev/null +++ b/host-configs/quartz-toss_3_x86_64_ib-clang@9.0.0.cmake @@ -0,0 +1,44 @@ +################### +# Generated host-config - Edit at own risk! +################### +# Copyright (c) 2020, Lawrence Livermore National Security, LLC and +# other Umpire Project Developers. See the top-level LICENSE file for +# details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +################### + +#------------------ +# SYS_TYPE: toss_3_x86_64_ib +# Compiler Spec: clang@9.0.0 +# CMake executable path: /usr/tce/packages/cmake/cmake-3.14.5/bin/cmake +#------------------ + +#------------------ +# Compilers +#------------------ + +set(CMAKE_C_COMPILER "/usr/tce/packages/clang/clang-9.0.0/bin/clang" CACHE PATH "") + +set(CMAKE_CXX_COMPILER "/usr/tce/packages/clang/clang-9.0.0/bin/clang++" CACHE PATH "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -msse4.2 -funroll-loops -finline-functions" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -msse4.2 -funroll-loops -finline-functions" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(ENABLE_CUDA OFF CACHE BOOL "") + +#------------------------------------------------------------------------------ +# Other +#------------------------------------------------------------------------------ + +set(RAJA_RANGE_ALIGN "4" CACHE STRING "") + +set(RAJA_RANGE_MIN_LENGTH "32" CACHE STRING "") + +set(RAJA_DATA_ALIGN "64" CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED ON CACHE BOOL "") + diff --git a/host-configs/ubuntu-builds/clang_X.cmake b/host-configs/ubuntu-builds/clang_X.cmake new file mode 100644 index 0000000000..beedc17ea6 --- /dev/null +++ b/host-configs/ubuntu-builds/clang_X.cmake @@ -0,0 +1,18 @@ +############################################################################### +# Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -msse4.2 -funroll-loops -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -msse4.2 -funroll-loops -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(RAJA_RANGE_ALIGN 4 CACHE STRING "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/ubuntu-builds/gcc_X.cmake b/host-configs/ubuntu-builds/gcc_X.cmake new file mode 100644 index 0000000000..da4d104c04 --- /dev/null +++ b/host-configs/ubuntu-builds/gcc_X.cmake @@ -0,0 +1,18 @@ +############################################################################### +# Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_GNU" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -finline-functions -finline-limit=20000" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -finline-functions -finline-limit=20000" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(RAJA_RANGE_ALIGN 4 CACHE STRING "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/hip.cmake b/host-configs/ubuntu-builds/hip.cmake similarity index 61% rename from host-configs/hip.cmake rename to host-configs/ubuntu-builds/hip.cmake index 8c020ac69b..7109eef7e3 100644 --- a/host-configs/hip.cmake +++ b/host-configs/ubuntu-builds/hip.cmake @@ -11,10 +11,10 @@ set(ENABLE_HIP ON CACHE BOOL "") set(ENABLE_OPENMP OFF CACHE BOOL "") set(ENABLE_CUDA Off CACHE BOOL "") -set(HIP_ROOT_DIR "/opt/rocm/hip" CACHE PATH "HIP ROOT directory path") +set(HIP_ROOT_DIR "${ROCM_DIR}/hip" CACHE PATH "HIP ROOT directory path") -set(CMAKE_CXX_COMPILER "g++" CACHE PATH "") -set(CMAKE_C_COMPILER "gcc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/usr/bin/g++" CACHE PATH "") +set(CMAKE_C_COMPILER "/usr/bin/gcc" CACHE PATH "") set(CMAKE_CXX_FLAGS_RELEASE "-O2" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g" CACHE STRING "") @@ -30,16 +30,15 @@ if (ENABLE_OPENMP) endif() if(CMAKE_BUILD_TYPE MATCHES Release) - set(RAJA_HIPCC_FLAGS -O2; ${HIP_COMMON_OPT_FLAGS}; ${HOST_OPT_FLAGS} CACHE LIST "") + set(RAJA_HIPCC_FLAGS "-O2 ${HIP_COMMON_OPT_FLAGS} ${HOST_OPT_FLAGS}" CACHE STRING "") elseif(CMAKE_BUILD_TYPE MATCHES RelWithDebInfo) - set(RAJA_HIPCC_FLAGS -g; -G; -O2; ${HIP_COMMON_OPT_FLAGS}; ${HOST_OPT_FLAGS} CACHE LIST "") + set(RAJA_HIPCC_FLAGS "-g -O2 ${HIP_COMMON_OPT_FLAGS} ${HOST_OPT_FLAGS}" CACHE STRING "") elseif(CMAKE_BUILD_TYPE MATCHES Debug) - set(RAJA_HIPCC_FLAGS -g; -G; -O0; ${HIP_COMMON_DEBUG_FLAGS}; CACHE LIST "") + set(RAJA_HIPCC_FLAGS "-g -O0 ${HIP_COMMON_DEBUG_FLAGS}" CACHE STRING "") endif() -set(RAJA_RANGE_ALIGN 4 CACHE INT "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "") -set(RAJA_DATA_ALIGN 64 CACHE INT "") -set(RAJA_COHERENCE_BLOCK_SIZE 64 CACHE INT "") +set(RAJA_RANGE_ALIGN 4 CACHE STRING "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") +set(RAJA_DATA_ALIGN 64 CACHE STRING "") 
-set(RAJA_HOST_CONFIG_LOADED On CACHE Bool "") +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/ubuntu-builds/nvcc_gcc_X.cmake b/host-configs/ubuntu-builds/nvcc_gcc_X.cmake new file mode 100644 index 0000000000..bc99e7ce4c --- /dev/null +++ b/host-configs/ubuntu-builds/nvcc_gcc_X.cmake @@ -0,0 +1,24 @@ +############################################################################### +# Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_GNU" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(HOST_OPT_FLAGS "-Xcompiler -O3 -Xcompiler -finline-functions -Xcompiler -fopenmp") + +set(CMAKE_CUDA_FLAGS_RELEASE "-O3 ${HOST_OPT_FLAGS}" CACHE STRING "") +set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0" CACHE STRING "") +set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo -O3 ${HOST_OPT_FLAGS}" CACHE STRING "") + +set(RAJA_RANGE_ALIGN 4 CACHE STRING "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index e0ef93d556..ea04033775 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -43,6 +43,7 @@ // #include "RAJA/pattern/forall.hpp" #include "RAJA/pattern/kernel.hpp" +#include "RAJA/pattern/teams.hpp" // @@ -108,6 +109,16 @@ #include "RAJA/util/View.hpp" +// +// View for sequences of objects +// +#include "RAJA/util/Span.hpp" + +// +// zip iterator to iterator over sequences simultaneously +// +#include "RAJA/util/zip.hpp" + // // Atomic operations support // @@ -123,6 +134,17 @@ // #include "RAJA/util/BitMask.hpp" +// +// sort algorithms +// +#include "RAJA/util/sort.hpp" + +// +// WorkPool, WorkGroup, WorkSite objects +// +#include "RAJA/policy/WorkGroup.hpp" +#include "RAJA/pattern/WorkGroup.hpp" + // // Reduction objects // @@ -148,7 +170,12 @@ // #include "RAJA/index/IndexSetUtils.hpp" +#include "RAJA/index/IndexSetBuilders.hpp" #include "RAJA/pattern/scan.hpp" +#include "RAJA/util/PluginLinker.hpp" + +#include "RAJA/pattern/sort.hpp" + #endif // closing endif for header file include guard diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in index a1f7847fed..91dc8c56a8 100644 --- a/include/RAJA/config.hpp.in +++ b/include/RAJA/config.hpp.in @@ -39,7 +39,7 @@ ****************************************************************************** */ #cmakedefine ENABLE_FT - +#cmakedefine ENABLE_ITERATOR_OVERFLOW_DEBUG /*! ****************************************************************************** * @@ -77,10 +77,19 @@ */ #cmakedefine RAJA_ENABLE_BOUNDS_CHECK +/* + ****************************************************************************** + * + * \brief Exhaustive index types for tests + * + ****************************************************************************** + */ +#cmakedefine RAJA_TEST_EXHAUSTIVE + /*! ****************************************************************************** * - * \brief Programming model back-ends, plus CHAI enable/disable. + * \brief Programming model back-ends. 
* ****************************************************************************** */ @@ -91,6 +100,8 @@ #cmakedefine RAJA_ENABLE_CLANG_CUDA #cmakedefine RAJA_ENABLE_HIP +#cmakedefine RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL + /*! ****************************************************************************** * @@ -135,11 +146,21 @@ namespace RAJA { #if defined(RAJA_ENABLE_OPENMP) -#if not defined(_OPENMP) -#error RAJA configured with ENABLE_OPENMP, but OpenMP not supported by current compiler +#if defined(_OPENMP) +#if _OPENMP >= 200805 +#define RAJA_ENABLE_OPENMP_TASK +#endif +#else +#error RAJA configured with ENABLE_OPENMP, but OpenMP not supported by current compiler #endif // _OPENMP #endif // RAJA_ENABLE_OPENMP +#if defined(RAJA_ENABLE_CUDA) +#if not defined(__CUDACC__) +#error RAJA configured with ENABLE_CUDA, but CUDA not supported by current compiler +#endif // +#endif // RAJA_ENABLE_CUDA + /*! ****************************************************************************** @@ -162,23 +183,11 @@ namespace RAJA { */ // -// Platform-specific constants for range index set and data alignment: -// -// RANGE_ALIGN - alignment of begin/end indices in range segments -// (i.e., starting index and length of range segments -// constructed by index set builder methods will -// be multiples of this value) -// -// RANGE_MIN_LENGTH - used in index set builder methods -// as min length of range segments (an integer multiple -// of RANGE_ALIGN) +// Platform-specific constants for data alignment: // // DATA_ALIGN - used in compiler-specific intrinsics and type aliases // to specify alignment of data, loop bounds, etc.; // units of "bytes" - -const int RANGE_ALIGN = @RAJA_RANGE_ALIGN@; -const int RANGE_MIN_LENGTH = @RAJA_RANGE_MIN_LENGTH@; const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #if defined (_WIN32) @@ -187,32 +196,20 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #define RAJA_RESTRICT __restrict__ #endif +#if !defined(RAJA_COMPILER_MSVC) +#define RAJA_COLLAPSE(X) collapse(X) +#else +#define RAJA_COLLAPSE(X) +#endif + // // Runtime bounds checking for Views // #if defined(RAJA_ENABLE_BOUNDS_CHECK) #define RAJA_BOUNDS_CHECK_INTERNAL #define RAJA_BOUNDS_CHECK_constexpr - -#if !defined(NDEBUG) -#define RAJA_ASSERT(EXP) assert( (EXP) ) -#endif - -// -//TODO: Once HIP is supported use asm("s_trap 2"); -// to halt HIP kernels. 
-// -#if defined(NDEBUG) && defined(__CUDA_ARCH__) -#define RAJA_ASSERT(EXP) asm ("trap;") -#endif - -#if defined(NDEBUG) && !defined(__CUDA_ARCH__) -#define RAJA_ASSERT(EXP) abort(); -#endif - #else #define RAJA_BOUNDS_CHECK_constexpr constexpr -#define RAJA_ASSERT(EXP) #endif // @@ -299,11 +296,11 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #if defined(_OPENMP) && (_OPENMP >= 201307) #define RAJA_SIMD RAJA_PRAGMA(omp simd) -#define RAJA_NO_SIMD +#define RAJA_NO_SIMD #elif defined(__GNUC__) && defined(__GNUC_MINOR__) && \ ( ( (__GNUC__ == 4) && (__GNUC_MINOR__ == 9) ) || (__GNUC__ >= 5) ) #define RAJA_SIMD RAJA_PRAGMA(GCC ivdep) -#define RAJA_NO_SIMD +#define RAJA_NO_SIMD #else #define RAJA_SIMD #define RAJA_NO_SIMD @@ -357,13 +354,24 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; // Apple Clang compiler supports older options #if ( ( (__clang_major__ >= 4 ) || (__clang_major__ >= 3 && __clang_minor__ > 7) ) && !defined(__APPLE__) ) #define RAJA_SIMD RAJA_PRAGMA(clang loop vectorize(assume_safety)) -#else +#else #define RAJA_SIMD RAJA_PRAGMA(clang loop vectorize(enable)) #endif #define RAJA_NO_SIMD RAJA_PRAGMA(clang loop vectorize(disable)) #endif + +// This is the same as undefined compiler, but squelches the warning message +#elif defined(RAJA_COMPILER_MSVC) + +#define RAJA_FORCEINLINE_RECURSIVE +#define RAJA_INLINE inline +#define RAJA_ALIGN_DATA(d) d +#define RAJA_SIMD +#define RAJA_NO_SIMD + + #else #pragma message("RAJA_COMPILER unknown, using default empty macros.") @@ -400,7 +408,7 @@ T * align_hint(T * x) return static_cast(RAJA_ALIGN_DATA(x)); #endif } - + } // closing brace for RAJA namespace #endif // closing endif for header file include guard diff --git a/include/RAJA/index/IndexSet.hpp b/include/RAJA/index/IndexSet.hpp index 5897edefbf..d95e43a871 100644 --- a/include/RAJA/index/IndexSet.hpp +++ b/include/RAJA/index/IndexSet.hpp @@ -90,7 +90,12 @@ class TypedIndexSet : public TypedIndexSet "All segments must have the same value_type"); //! Construct empty index set +#if _MSC_VER < 1910 + // this one instance of constexpr does not work on VS2012 or VS2015 + RAJA_INLINE TypedIndexSet() : PARENT() {} +#else RAJA_INLINE constexpr TypedIndexSet() : PARENT() {} +#endif //! Copy-constructor for index set RAJA_INLINE @@ -232,12 +237,15 @@ class TypedIndexSet : public TypedIndexSet { Index_type num = getNumSegments(); - RangeStrideSegment Iter = (pend == PUSH_BACK) - ? RangeStrideSegment(0, num, 1) - : RangeStrideSegment(num - 1, -1, -1); - - for (Index_type i : Iter) - segment_push_into(i, c, pend, pcopy); + if (pend == PUSH_BACK) { + for (Index_type i = 0; i < num; ++i) { + segment_push_into(i, c, pend, pcopy); + } + } else { + for (Index_type i = num-1; i > -1; --i) { + segment_push_into(i, c, pend, pcopy); + } + } } diff --git a/include/RAJA/index/IndexSetBuilders.hpp b/include/RAJA/index/IndexSetBuilders.hpp index 1202a1a554..60e8c160e0 100644 --- a/include/RAJA/index/IndexSetBuilders.hpp +++ b/include/RAJA/index/IndexSetBuilders.hpp @@ -21,33 +21,46 @@ #include "RAJA/config.hpp" #include "RAJA/index/IndexSet.hpp" +#include "RAJA/index/ListSegment.hpp" +#include "RAJA/index/RangeSegment.hpp" #include "RAJA/util/types.hpp" +#include "camp/resource.hpp" + namespace RAJA { /*! ****************************************************************************** * - * \brief Initialize index set with aligned Ranges and List segments from - * array of indices with given length. 
- * - * Specifically, Range segments will be greater than RANGE_MIN_LENGTH - * and starting index and length of each range segment will be - * multiples of RANGE_ALIGN. These constants are defined in the - * RAJA config.hpp header file. + * \brief Generate an index set with aligned Range segments and List segments, + * as needed, from given array of indices. * - * Routine does no error-checking on argements and assumes Index_type - * array contains valid indices. + * Routine does no error-checking on argements and assumes + * RAJA::Index_type array contains valid indices. * - * Note: Method assumes TypedIndexSet reference refers to an empty index set. + * \param iset reference to index set generated with aligned range segments + * and list segments. Method assumes index set is empty (no segments). + * \param work_res camp resource object that identifies the memory space in + * which list segment index data will live (passed to list segment + * ctor). + * \param indices_in pointer to start of input array of indices. + * \param length size of input index array. + * \param range_min_length min length of any range segment in index set + * \param range_align "alignment" value for range segments in index set. + * Starting index each range segment will be a multiple of this value. * ****************************************************************************** */ -void buildTypedIndexSetAligned(IndexSet& hiset, - const Index_type* const indices_in, - Index_type length); +void buildIndexSetAligned( + RAJA::TypedIndexSet& iset, + camp::resources::Resource& work_res, + const RAJA::Index_type* const indices_in, + RAJA::Index_type length, + RAJA::Index_type range_min_length, + RAJA::Index_type range_align); + //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// @@ -62,47 +75,56 @@ void buildTypedIndexSetAligned(IndexSet& hiset, //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// -/* +/*! ****************************************************************************** * - * Initialize lock-free "block" index set (planar division). + * \brief Generate a lock-free "block" index set (planar division) containing + * range segments. * - * The method chunks a fastDim x midDim x slowDim mesh into blocks that can - * be dependency-scheduled, removing need for lock constructs. + * The method chunks a fastDim x midDim x slowDim mesh into blocks that + * can be dependency-scheduled, removing need for lock constructs. * - * Note: Method assumes TypedIndexSet reference refers to an empty index set. + * \param iset reference to index set generated with range segments. + * Method assumes index set is empty (no segments). + * \param fastDim "fast" block dimension (see above). + * \param midDim "mid" block dimension (see above). + * \param slowDim "slow" block dimension (see above). * ****************************************************************************** */ void buildLockFreeBlockIndexset( - RAJA::TypedIndexSet& iset, + RAJA::TypedIndexSet& iset, int fastDim, int midDim, int slowDim); -/* +/*! ****************************************************************************** * - * Build Lock-free "color" index set. The domain-set is colored based on - * connectivity to the range-set. All elements in each segment are - * independent, and no two segments can be executed in parallel. 
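The renamed builder above, buildIndexSetAligned, now takes a camp resource for the list segment data plus explicit range_min_length and range_align arguments (the values 32 and 4 below match the old RAJA_RANGE_MIN_LENGTH and RAJA_RANGE_ALIGN defaults removed elsewhere in this changeset). A hypothetical call; the index set's segment types are an assumption, since this hunk does not show them:

    #include "RAJA/RAJA.hpp"
    #include "camp/resource.hpp"

    void build_aligned_iset(const RAJA::Index_type* indices_in,
                            RAJA::Index_type length)
    {
      // Assumed segment types: range segments plus list segments over
      // Index_type.
      RAJA::TypedIndexSet<RAJA::RangeSegment,
                          RAJA::TypedListSegment<RAJA::Index_type>> iset;

      // List segment index data will live in host memory.
      camp::resources::Resource work_res{camp::resources::Host()};

      RAJA::buildIndexSetAligned(iset, work_res, indices_in, length,
                                 32 /* range_min_length */,
                                 4  /* range_align */);
    }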
+ * \brief Generate a lock-free "color" index set containing range and list + * segments. + * + * TThe domain-set is colored based on connectivity to the range-set. + * All elements in each segment are independent, and no two segments + * can be executed in parallel. * - * Note: Method assumes TypedIndexSet reference refers to an empty index set. + * \param iset reference to index set generated. Method assumes index set + * is empty (no segments). + * \param work_res camp resource object that identifies the memory space in + * which list segment index data will live (passed to list segment + * ctor). * ****************************************************************************** */ void buildLockFreeColorIndexset( - RAJA::TypedIndexSet& iset, - Index_type const* domainToRange, + RAJA::TypedIndexSet& iset, + camp::resources::Resource& work_res, + RAJA::Index_type const* domainToRange, int numEntity, int numRangePerDomain, int numEntityRange, - Index_type* elemPermutation = 0l, - Index_type* ielemPermutation = 0l); + RAJA::Index_type* elemPermutation = nullptr, + RAJA::Index_type* ielemPermutation = nullptr); } // namespace RAJA diff --git a/include/RAJA/index/IndexValue.hpp b/include/RAJA/index/IndexValue.hpp index 2f815969d4..e863978c86 100644 --- a/include/RAJA/index/IndexValue.hpp +++ b/include/RAJA/index/IndexValue.hpp @@ -356,6 +356,19 @@ struct StripIndexTypeT using strip_index_type_t = typename internal::StripIndexTypeT::type; +/*! + * \brief Converts a type into a signed type. Also handles floating point + * types as std::make_signed only supports integral types. + * + * \param FROM the original type + */ +template +using make_signed_t = typename std::conditional < + std::is_floating_point::value, + std::common_type, + std::make_signed + >::type::type; + } // namespace RAJA /*! @@ -381,18 +394,17 @@ using strip_index_type_t = typename internal::StripIndexTypeT::type; /*! * \brief Helper Macro to create new Index types. * \param TYPE the name of the type + * \param IDXT the index types value type * \param NAME a string literal to identify this index type */ #define RAJA_INDEX_VALUE_T(TYPE, IDXT, NAME) \ class TYPE : public ::RAJA::IndexValue \ { \ - using parent = ::RAJA::IndexValue; \ - \ public: \ - using IndexValueType = TYPE; \ - RAJA_HOST_DEVICE RAJA_INLINE TYPE() : parent::IndexValue() {} \ - RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(IDXT v) \ - : parent::IndexValue(v) \ + RAJA_HOST_DEVICE RAJA_INLINE TYPE() \ + : RAJA::IndexValue::IndexValue() {} \ + RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(IDXT v) \ + : RAJA::IndexValue::IndexValue(v) \ { \ } \ static inline std::string getName() { return NAME; } \ diff --git a/include/RAJA/index/ListSegment.hpp b/include/RAJA/index/ListSegment.hpp index 06efda5b33..0f5ad36e2e 100644 --- a/include/RAJA/index/ListSegment.hpp +++ b/include/RAJA/index/ListSegment.hpp @@ -24,10 +24,11 @@ #include #include -#include "RAJA/internal/Span.hpp" +#include "camp/resource.hpp" #include "RAJA/util/concepts.hpp" #include "RAJA/util/macros.hpp" +#include "RAJA/util/Span.hpp" #include "RAJA/util/types.hpp" #if (defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))) && defined(RAJA_ENABLE_CUDA) @@ -61,7 +62,14 @@ namespace RAJA template class TypedListSegment { - +/* + * All of the following down to the 'public' section is original machinery + * to manage segment index data using CUDA or HIP unified memory. 
Eventually, + * it will be removed, but is left in place for now to preserve original + * behavior so our tests don't need to be reworked en masse now and users + * won't see any different usage or behavior. + */ + #if ((defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))) && defined(RAJA_ENABLE_CUDA)) || defined(RAJA_ENABLE_HIP) static constexpr bool Has_GPU = true; #else @@ -117,6 +125,7 @@ class TypedListSegment cudaErrchk(cudaMemcpy( m_data, &(*src.begin()), m_size * sizeof(T), cudaMemcpyDefault)); } + #elif defined(RAJA_ENABLE_HIP) //! copy data from container using BlockCopy template @@ -166,20 +175,103 @@ class TypedListSegment //! prevent compiler from providing a default constructor TypedListSegment() = delete; +/* + * The following two constructors allow users to specify a camp resource + * for each list segment, which will be used to manage segment index data. + * + * Eventually, I think it would be better to add a template parameter for + * this class to specify the camp resource type rather than passing in a + * resource object. + */ + /// + /// \brief Construct list segment from given array with specified length + /// and use given camp resource to allocate list segment index data + /// if owned by this list segment. + /// + /// By default the ctor performs a deep copy of array elements. + /// + /// If 'Unowned' is passed as last argument, the constructed object + /// does not own the segment data and will hold a pointer to given + /// array's data. In this case, caller must manage object lifetimes properly. + /// + TypedListSegment(const value_type* values, + Index_type length, + camp::resources::Resource& resource, + IndexOwnership owned = Owned) + : m_resource(resource), m_use_resource(true) + { + initIndexData(m_use_resource, + values, length, owned); + } + + /// + /// Construct list segment from arbitrary object holding + /// indices using a deep copy of given data. + /// + /// The object must provide methods: begin(), end(), size(). + /// + template + TypedListSegment(const Container& container, + camp::resources::Resource& resource) + : m_resource(resource), m_use_resource(true), + m_owned(Unowned), m_data(nullptr), m_size(container.size()) + { + + if (m_size > 0) { + + camp::resources::Resource host_res{camp::resources::Host()}; + + value_type* tmp = host_res.allocate(m_size); + + auto dest = tmp; + auto src = container.begin(); + auto const end = container.end(); + while (src != end) { + *dest = *src; + ++dest; + ++src; + } + + m_data = m_resource.allocate(m_size); + m_resource.memcpy(m_data, tmp, sizeof(value_type) * m_size); + m_owned = Owned; + + host_res.deallocate(tmp); + + } + } + + +/* + * The following two ctors preserve the original list segment behavior for + * CUDA and HIP device memory management. + * + * Note that the host resource object created in the member initialization + * list is not used. Whether the memory management routines use the original + * approach or camp resources is controlled by the m_use_resource + * boolean member. + */ + /// /// \brief Construct list segment from given array with specified length. /// /// By default the ctor performs deep copy of array elements. + /// /// If 'Unowned' is passed as last argument, the constructed object - /// does not own the segment data and will hold a pointer to given data. - /// In this case, caller must manage object lifetimes properly. + /// does not own the segment data and will hold a pointer to given + /// array's data.
In this case, caller must manage object lifetimes properly. /// + RAJA_DEPRECATE("In next RAJA release, TypedListSegment ctor will require a camp Resource object") TypedListSegment(const value_type* values, Index_type length, IndexOwnership owned = Owned) + : m_resource(camp::resources::Resource{camp::resources::Host()}), + m_use_resource(false), + m_owned(Unowned), m_data(nullptr), m_size(0) { - // future TODO -- change to initializer list somehow - initIndexData(values, length, owned); + initIndexData(m_use_resource, + values, length, owned); } /// @@ -189,28 +281,36 @@ class TypedListSegment /// The object must provide methods: begin(), end(), size(). /// template + RAJA_DEPRECATE("In next RAJA release, TypedListSegment ctor will require a camp Resource object") explicit TypedListSegment(const Container& container) - : m_data(nullptr), m_size(container.size()), m_owned(Unowned) + : m_resource(camp::resources::Resource{camp::resources::Host()}), + m_use_resource(false), + m_owned(Unowned), m_data(nullptr), m_size(container.size()) { - if (m_size <= 0) return; - allocate_and_copy(container); - m_owned = Owned; + if (m_size > 0) { + allocate_and_copy(container); + m_owned = Owned; + } } /// /// Copy-constructor for list segment. /// TypedListSegment(const TypedListSegment& other) + : m_resource(other.m_resource), m_use_resource(other.m_use_resource), + m_owned(Unowned), m_data(nullptr), m_size(0) { - // future TODO: switch to member initialization list ... somehow - initIndexData(other.m_data, other.m_size, other.m_owned); + bool from_copy_ctor = true; + initIndexData(other.m_use_resource, + other.m_data, other.m_size, other.m_owned, from_copy_ctor); } /// /// Move-constructor for list segment. /// TypedListSegment(TypedListSegment&& rhs) - : m_data(rhs.m_data), m_size(rhs.m_size), m_owned(rhs.m_owned) + : m_resource(rhs.m_resource), m_use_resource(rhs.m_use_resource), + m_owned(rhs.m_owned), m_data(rhs.m_data), m_size(rhs.m_size) { // make the rhs non-owning so it's destructor won't have any side effects rhs.m_owned = Unowned; @@ -221,8 +321,15 @@ class TypedListSegment /// ~TypedListSegment() { - if (m_data == nullptr || m_owned != Owned) return; - deallocate(std::integral_constant()); + if (m_data != nullptr && m_owned == Owned) { + + if (m_use_resource) { + m_resource.deallocate(m_data); + } else { + deallocate(std::integral_constant()); + } + + } } @@ -231,6 +338,8 @@ class TypedListSegment /// RAJA_HOST_DEVICE void swap(TypedListSegment& other) { + camp::safe_swap(m_resource, other.m_resource); + camp::safe_swap(m_use_resource, other.m_use_resource); camp::safe_swap(m_data, other.m_data); camp::safe_swap(m_size, other.m_size); camp::safe_swap(m_owned, other.m_owned); @@ -241,6 +350,7 @@ class TypedListSegment //! accessor to get the begin iterator for a TypedListSegment RAJA_HOST_DEVICE iterator begin() const { return m_data; } + //! accessor to retrieve the total number of elements in a TypedListSegment RAJA_HOST_DEVICE Index_type size() const { return m_size; } @@ -281,34 +391,77 @@ class TypedListSegment // Initialize segment data properly based on whether object // owns the index data. 
// - void initIndexData(const value_type* container, + void initIndexData(bool use_resource, + const value_type* container, Index_type len, - IndexOwnership container_own) + IndexOwnership container_own, + bool from_copy_ctor = false) { - // empty + + // empty list segment if (len <= 0 || container == nullptr) { m_data = nullptr; m_size = 0; m_owned = Unowned; return; } - // some size -- initialize accordingly + + // some non-zero size -- initialize accordingly m_size = len; m_owned = container_own; if (m_owned == Owned) { - allocate_and_copy(RAJA::impl::make_span(container, len)); + + if (use_resource) { + + if ( from_copy_ctor ) { + + m_data = m_resource.allocate(m_size); + m_resource.memcpy(m_data, container, sizeof(value_type) * m_size); + + } else { + + camp::resources::Resource host_res{camp::resources::Host()}; + + value_type* tmp = host_res.allocate(m_size); + + for (Index_type i = 0; i < m_size; ++i) { + tmp[i] = container[i]; + } + + m_data = m_resource.allocate(m_size); + m_resource.memcpy(m_data, tmp, sizeof(value_type) * m_size); + + host_res.deallocate(tmp); + + } + + } else { + allocate_and_copy(RAJA::make_span(container, len)); + } + return; } + + // list segment accesses container data directly. // Uh-oh. Using evil const_cast.... m_data = const_cast(container); } - //! buffer storage for list data + + // Copy of camp resource passed to ctor + camp::resources::Resource m_resource; + + // Boolean indicating whether camp resource is used to manage index data + bool m_use_resource; + + // ownership flag to guide data copying/management + IndexOwnership m_owned; + + // buffer storage for list data value_type* RAJA_RESTRICT m_data; - //! size of list segment + + // size of list segment Index_type m_size; - //! ownership flag to guide data copying/management - IndexOwnership m_owned; }; //! alias for A TypedListSegment with storage type @Index_type diff --git a/include/RAJA/index/RangeSegment.hpp b/include/RAJA/index/RangeSegment.hpp index 39db675e89..df9e187f1b 100644 --- a/include/RAJA/index/RangeSegment.hpp +++ b/include/RAJA/index/RangeSegment.hpp @@ -69,8 +69,12 @@ namespace RAJA * ****************************************************************************** */ -template > + +template >> struct TypedRangeSegment { + + static_assert(std::is_signed::value, "TypedRangeSegment DiffT requires signed type."); + static_assert(!std::is_floating_point::value, "TypedRangeStrideSegment Type must be non floating point."); //! the underlying iterator type using iterator = Iterators::numeric_iterator; @@ -80,15 +84,17 @@ struct TypedRangeSegment { */ using value_type = StorageT; - using IndexType = StorageT; + using IndexType = DiffT; //! construct a TypedRangeSegment from a begin and end value /*! * \param[in] begin the starting value (inclusive) for the range * \param[in] end the ending value (exclusive) for the range */ - RAJA_HOST_DEVICE constexpr TypedRangeSegment(DiffT begin, DiffT end) - : m_begin(iterator{begin}), m_end(iterator{end}) + using StripStorageT = strip_index_type_t; + RAJA_HOST_DEVICE constexpr TypedRangeSegment(StripStorageT begin, StripStorageT end) + : m_begin(iterator(begin)), + m_end(begin > end ? m_begin : iterator(end)) { } @@ -145,18 +151,18 @@ struct TypedRangeSegment { /*! * \return the range (end - begin) of this Segment */ - RAJA_HOST_DEVICE RAJA_INLINE StorageT size() const { return m_end - m_begin; } + RAJA_HOST_DEVICE RAJA_INLINE DiffT size() const { return m_end - m_begin; } //! Create a slice of this instance as a new instance /*! 
* \return A new instance spanning *begin() + begin to *begin() + begin + * length */ - RAJA_HOST_DEVICE RAJA_INLINE TypedRangeSegment slice(DiffT begin, + RAJA_HOST_DEVICE RAJA_INLINE TypedRangeSegment slice(StorageT begin, DiffT length) const { - auto start = m_begin[0] + begin; - auto end = start + length > m_end[0] ? m_end[0] : start + length; + StorageT start = m_begin[0] + begin; + StorageT end = start + length > m_end[0] ? m_end[0] : start + length; return TypedRangeSegment{stripIndexType(start), stripIndexType(end)}; } @@ -172,6 +178,12 @@ struct TypedRangeSegment { return m_begin == o.m_begin && m_end == o.m_end; } + + RAJA_HOST_DEVICE RAJA_INLINE bool operator!=(TypedRangeSegment const& o) const + { + return !(operator==(o)); + } + private: //! member variable for begin iterator iterator m_begin; @@ -240,9 +252,12 @@ struct TypedRangeSegment { * ****************************************************************************** */ -template > +template >> struct TypedRangeStrideSegment { + static_assert(std::is_signed::value, "TypedRangeStrideSegment DiffT requires signed type."); + static_assert(!std::is_floating_point::value, "TypedRangeStrideSegment Type must be non floating point."); + //! the underlying iterator type using iterator = Iterators::strided_numeric_iterator; @@ -252,15 +267,16 @@ struct TypedRangeStrideSegment { */ using value_type = StorageT; - using IndexType = StorageT; + using IndexType = DiffT; //! construct a TypedRangeStrideSegment from a begin and end value /*! * \param[in] begin the starting value (inclusive) for the range * \param[in] end the ending value (exclusive) for the range * \param[in] stride the increment value for the iteration of the range */ - RAJA_HOST_DEVICE TypedRangeStrideSegment(DiffT begin, - DiffT end, + using StripStorageT = strip_index_type_t; + RAJA_HOST_DEVICE TypedRangeStrideSegment(StripStorageT begin, + StripStorageT end, DiffT stride) : m_begin(iterator(begin, stride)), m_end(iterator(end, stride)), @@ -272,6 +288,13 @@ struct TypedRangeStrideSegment { // (stride > 0 ? value_type{1} : value_type{-1})) / // static_cast(stride)) { + // clamp range when the end is unreachable from the beginning without + // wrapping + if (stride < 0 && end > begin) { + m_end = m_begin; + } else if (stride > 0 && end < begin) { + m_end = m_begin; + } // if m_size was initialized as negative, that indicates a zero iteration // space m_size = m_size < DiffT{0} ? DiffT{0} : m_size; @@ -344,12 +367,12 @@ struct TypedRangeStrideSegment { * \return A new instance spanning *begin() + begin * stride to *begin() + * (begin + length) * stride */ - RAJA_HOST_DEVICE TypedRangeStrideSegment slice(DiffT begin, + RAJA_HOST_DEVICE TypedRangeStrideSegment slice(StorageT begin, DiffT length) const { - auto stride = m_begin.get_stride(); - auto start = m_begin[0] + begin * stride; - auto end = start + stride * length; + StorageT stride = m_begin.get_stride(); + StorageT start = m_begin[0] + begin * stride; + StorageT end = start + stride * length; if (stride > 0) { end = end > m_end[0] ? 
m_end[0] : end; @@ -440,12 +463,14 @@ RAJA_HOST_DEVICE TypedRangeSegment make_range(BeginT&& begin, template > + typename Common = detail::common_type_t> RAJA_HOST_DEVICE TypedRangeStrideSegment make_strided_range( BeginT&& begin, EndT&& end, StrideT&& stride) { + static_assert(std::is_signed::value, "make_strided_segment : stride must be signed."); + static_assert(std::is_same, StrideT>::value, "make_stride_segment : stride and end must be of similar types."); return {begin, end, stride}; } diff --git a/include/RAJA/internal/Iterators.hpp b/include/RAJA/internal/Iterators.hpp index 2406bedfbe..541519f860 100644 --- a/include/RAJA/internal/Iterators.hpp +++ b/include/RAJA/internal/Iterators.hpp @@ -18,12 +18,15 @@ #ifndef RAJA_ITERATORS_HPP #define RAJA_ITERATORS_HPP -#include "RAJA/config.hpp" - #include +#include +#include #include +#include #include +#include "RAJA/config.hpp" +#include "RAJA/index/IndexValue.hpp" #include "RAJA/util/macros.hpp" #include "RAJA/util/types.hpp" @@ -34,6 +37,69 @@ namespace Iterators // Containers +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) +template +std::string overflow_msg(LType lhs, RType rhs) +{ + return "Iterator Overflow detected between operation of :\n\ttype : " + + (std::string) typeid(lhs).name() + " val : " + std::to_string(lhs) + + "\n\ttype : " + typeid(rhs).name() + " val : " + std::to_string(rhs) + + "\n"; +} + +template +RAJA_HOST_DEVICE bool is_addition_overflow(Type lhs, DifferenceType rhs) +{ + if (std::is_unsigned::value) { + if ((rhs > 0) && (lhs > std::numeric_limits::max() - rhs)) + return true; + if ((rhs < 0) && (lhs < std::numeric_limits::min() - rhs)) + return true; + } + return false; +} + +template +RAJA_HOST_DEVICE bool is_subtraction_overflow(Type lhs, + DifferenceType rhs, + bool iterator_on_left = true) +{ + if (iterator_on_left) { + + if (std::is_unsigned::value) { + if ((rhs > 0) && (lhs < std::numeric_limits::min() + rhs)) + return true; + if ((rhs < 0) && (lhs > std::numeric_limits::max() + rhs)) + return true; + } + + } else { // Special case where operation is : value(lhs) - iterator(rhs). 
+ + if (std::is_unsigned::value) { + if ((lhs > 0) && (rhs < std::numeric_limits::min() + lhs)) + return true; + if ((lhs < 0)) return true; + } + } + return false; +} + +template +RAJA_HOST_DEVICE void check_is_addition_overflow(Type lhs, DifferenceType rhs) +{ + if (is_addition_overflow(lhs, rhs)) + throw std::runtime_error(overflow_msg(lhs, rhs)); +} + +template +RAJA_HOST_DEVICE void check_is_subtraction_overflow(Type lhs, + DifferenceType rhs) +{ + if (is_subtraction_overflow(lhs, rhs)) + throw std::runtime_error(overflow_msg(lhs, rhs)); +} +#endif + template @@ -41,20 +107,35 @@ class numeric_iterator { public: using value_type = Type; + using stripped_value_type = strip_index_type_t; using difference_type = DifferenceType; using pointer = PointerType; using reference = value_type&; using iterator_category = std::random_access_iterator_tag; - RAJA_HOST_DEVICE constexpr numeric_iterator() : val(0) {} - RAJA_HOST_DEVICE constexpr numeric_iterator(const difference_type& rhs) - : val(rhs) + RAJA_HOST_DEVICE constexpr numeric_iterator() {} + RAJA_HOST_DEVICE constexpr numeric_iterator(const numeric_iterator& rhs) + : val(rhs.val) { } - RAJA_HOST_DEVICE constexpr numeric_iterator(const numeric_iterator& rhs) + RAJA_HOST_DEVICE constexpr numeric_iterator(numeric_iterator&& rhs) : val(rhs.val) { } + RAJA_HOST_DEVICE numeric_iterator& operator=(const numeric_iterator& rhs) + { + val = rhs.val; + return *this; + } + RAJA_HOST_DEVICE numeric_iterator& operator=(numeric_iterator&& rhs) + { + val = rhs.val; + return *this; + } + RAJA_HOST_DEVICE constexpr numeric_iterator(const stripped_value_type& rhs) + : val(rhs) + { + } RAJA_HOST_DEVICE inline DifferenceType get_stride() const { return 1; } @@ -109,12 +190,18 @@ class numeric_iterator RAJA_HOST_DEVICE inline numeric_iterator& operator+=( const difference_type& rhs) { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_addition_overflow(val, rhs); +#endif val += rhs; return *this; } RAJA_HOST_DEVICE inline numeric_iterator& operator-=( const difference_type& rhs) { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_subtraction_overflow(val, rhs); +#endif val -= rhs; return *this; } @@ -131,12 +218,12 @@ class numeric_iterator return *this; } - RAJA_HOST_DEVICE inline difference_type operator+( + RAJA_HOST_DEVICE inline stripped_value_type operator+( const numeric_iterator& rhs) const { return val + rhs.val; } - RAJA_HOST_DEVICE inline difference_type operator-( + RAJA_HOST_DEVICE inline stripped_value_type operator-( const numeric_iterator& rhs) const { return val - rhs.val; @@ -144,24 +231,42 @@ class numeric_iterator RAJA_HOST_DEVICE inline numeric_iterator operator+( const difference_type& rhs) const { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_addition_overflow(val, rhs); +#endif return numeric_iterator(val + rhs); } RAJA_HOST_DEVICE inline numeric_iterator operator-( const difference_type& rhs) const { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_subtraction_overflow(val, rhs); +#endif return numeric_iterator(val - rhs); } RAJA_HOST_DEVICE friend constexpr numeric_iterator operator+( difference_type lhs, const numeric_iterator& rhs) { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + return is_addition_overflow(rhs.val, lhs) + ? 
throw std::runtime_error(overflow_msg(lhs, rhs.val)) + : numeric_iterator(lhs + rhs.val); +#else return numeric_iterator(lhs + rhs.val); +#endif } RAJA_HOST_DEVICE friend constexpr numeric_iterator operator-( difference_type lhs, const numeric_iterator& rhs) { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + return is_subtraction_overflow(rhs.val, lhs, false) + ? throw std::runtime_error(overflow_msg(lhs, rhs.val)) + : numeric_iterator(lhs - rhs.val); +#else return numeric_iterator(lhs - rhs.val); +#endif } RAJA_HOST_DEVICE inline value_type operator*() const @@ -178,7 +283,7 @@ class numeric_iterator } private: - difference_type val; + stripped_value_type val = 0; }; template ; using difference_type = DifferenceType; using pointer = DifferenceType*; using reference = DifferenceType&; using iterator_category = std::random_access_iterator_tag; - RAJA_HOST_DEVICE constexpr strided_numeric_iterator() : val(0), stride(1) {} - + RAJA_HOST_DEVICE constexpr strided_numeric_iterator() {} RAJA_HOST_DEVICE constexpr strided_numeric_iterator( - DifferenceType rhs, - DifferenceType stride_ = DifferenceType(1)) - : val(rhs), stride(stride_) + const strided_numeric_iterator& rhs) + : val(rhs.val), stride(rhs.stride) + { + } + RAJA_HOST_DEVICE constexpr strided_numeric_iterator(strided_numeric_iterator&& rhs) + : val(rhs.val), stride(rhs.stride) + { + } + RAJA_HOST_DEVICE strided_numeric_iterator& operator=( + const strided_numeric_iterator& rhs) + { + val = rhs.val; + stride = rhs.stride; + return *this; + } + RAJA_HOST_DEVICE strided_numeric_iterator& operator=( + strided_numeric_iterator&& rhs) { + val = rhs.val; + stride = rhs.stride; + return *this; } RAJA_HOST_DEVICE constexpr strided_numeric_iterator( - const strided_numeric_iterator& rhs) - : val(rhs.val), stride(rhs.stride) + stripped_value_type rhs, + DifferenceType stride_ = DifferenceType(1)) + : val(rhs), stride(stride_) { } @@ -224,12 +347,18 @@ class strided_numeric_iterator RAJA_HOST_DEVICE inline strided_numeric_iterator& operator+=( const difference_type& rhs) { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_addition_overflow(val, rhs * stride); +#endif val += rhs * stride; return *this; } RAJA_HOST_DEVICE inline strided_numeric_iterator& operator-=( const difference_type& rhs) { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_subtraction_overflow(val, rhs * stride); +#endif val -= rhs * stride; return *this; } @@ -254,11 +383,17 @@ class strided_numeric_iterator RAJA_HOST_DEVICE inline strided_numeric_iterator operator+( const difference_type& rhs) const { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_addition_overflow(val, rhs * stride); +#endif return strided_numeric_iterator(val + rhs * stride, stride); } RAJA_HOST_DEVICE inline strided_numeric_iterator operator-( const difference_type& rhs) const { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_subtraction_overflow(val, rhs * stride); +#endif return strided_numeric_iterator(val - rhs * stride, stride); } @@ -311,8 +446,8 @@ class strided_numeric_iterator } private: - DifferenceType val; - DifferenceType stride; + stripped_value_type val = 0; + DifferenceType stride = 1; }; diff --git a/include/RAJA/internal/LegacyCompatibility.hpp b/include/RAJA/internal/LegacyCompatibility.hpp deleted file mode 100644 index 73954eb734..0000000000 --- a/include/RAJA/internal/LegacyCompatibility.hpp +++ /dev/null @@ -1,315 +0,0 @@ -/*! 
- ****************************************************************************** - * - * \file - * - * \brief Header file with support for pre-C++14 compilers. - * - ****************************************************************************** - */ - -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#ifndef RAJA_LEGACY_COMPATIBILITY_HPP -#define RAJA_LEGACY_COMPATIBILITY_HPP - -#include "RAJA/config.hpp" - -#include -#include -#include -#include -#include - -#include "camp/camp.hpp" - -#include "RAJA/util/macros.hpp" - -#if (!defined(__INTEL_COMPILER)) && (!defined(RAJA_COMPILER_MSVC)) -static_assert(__cplusplus >= 201103L, - "C++ standards below 2011 are not " - "supported" RAJA_STRINGIFY_HELPER(__cplusplus)); -#endif - -#if __cplusplus > 201400L -#define RAJA_CXX14_CONSTEXPR constexpr -#else -#define RAJA_CXX14_CONSTEXPR -#endif - -// #if defined(RAJA_USE_CUDA) -// #include -// namespace VarOps { -// using thrust::tuple; -// using thrust::tuple_element; -// using thrust::get; -// using thrust::tuple_size; -// using thrust::make_tuple; -// } -// #else -#include -#include -namespace VarOps -{ -using std::get; -using std::make_tuple; -using std::tuple; -using std::tuple_cat; -using std::tuple_element; -using std::tuple_size; -} // namespace VarOps -// #endif - -namespace VarOps -{ - -// Basics, using c++14 semantics in a c++11 compatible way, credit to libc++ - -// Forward - -// FoldL -template -struct foldl_impl; - -template -struct foldl_impl { - using Ret = Arg1; -}; - -template -struct foldl_impl { - using Ret = typename std::result_of::type; -}; - -template -struct foldl_impl { - using Ret = typename foldl_impl< - Op, - typename std::result_of::type, - Arg3)>::type, - Rest...>::Ret; -}; - -template -RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl( - Op&& RAJA_UNUSED_ARG(operation), - Arg1&& arg) -> typename foldl_impl::Ret -{ - return camp::forward(arg); -} - -template -RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation, - Arg1&& arg1, - Arg2&& arg2) -> - typename foldl_impl::Ret -{ - return camp::forward(operation)(camp::forward(arg1), - camp::forward(arg2)); -} - -template -RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation, - Arg1&& arg1, - Arg2&& arg2, - Arg3&& arg3, - Rest&&... rest) -> - typename foldl_impl::Ret -{ - return foldl(camp::forward(operation), - camp::forward(operation)( - camp::forward(operation)(camp::forward(arg1), - camp::forward(arg2)), - camp::forward(arg3)), - camp::forward(rest)...); -} - - -// Convenience folds -template -RAJA_HOST_DEVICE RAJA_INLINE constexpr Result sum(Args... args) -{ - return foldl(RAJA::operators::plus(), args...); -} - -template -RAJA_HOST_DEVICE RAJA_INLINE constexpr Result max(Args... args) -{ - return foldl(RAJA::operators::maximum(), args...); -} - -template -RAJA_HOST_DEVICE RAJA_INLINE constexpr Result min(Args... 
args) -{ - return foldl(RAJA::operators::minimum(), args...); -} - -// template -// struct product_first_n; -// -// template -// struct product_first_n{ -// static Result value = 1; -// template -// constexpr product_first_n(Args...args) : value{1} { } -// }; -// -// template -// struct product_first_n{ -// static Result value = product_first_n(args...)::value; -// template -// constexpr product_first_n(FirstArg arg1, Args...args) -// : value() { } -// }; - -template