diff --git a/.gitignore b/.gitignore index 1c0750848b..f4f1cd0dc1 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,11 @@ *.a *.exe *.gch -build/ -build-*/ +/*.sublime-* +/build/ +/build_*/ +/build-*/ +/install/ +/install_*/ +/install-*/ /Debug/ diff --git a/.travis.yml b/.travis.yml index dba6d536dd..23e1ea44e1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -29,28 +29,17 @@ matrix: env: - COMPILER=g++ - IMG=gcc8 - - CMAKE_EXTRA_FLAGS="-DENABLE_WARNINGS=On -DENABLE_TBB=On" + - CMAKE_EXTRA_FLAGS="-DENABLE_WARNINGS=On -DENABLE_TBB=On -DRAJA_ENABLE_BOUNDS_CHECK=ON" - compiler: clang9 env: - COMPILER=clang++-9 - IMG=clang9 - CMAKE_EXTRA_FLAGS="-DCMAKE_CXX_FLAGS=-fmodules -DENABLE_TBB=On" - - compiler: clang5 - env: - - COMPILER=clang++ - - IMG=clang5 - - CMAKE_EXTRA_FLAGS="-DCMAKE_CXX_FLAGS=-fmodules -DENABLE_TBB=On" - compiler: intel18 env: - COMPILER=/opt/intel/bin/icpc - IMG=icc18 - - CMAKE_EXTRA_FLAGS="-DENABLE_TBB=On" - - compiler: nvcc9 - env: - - COMPILER=g++ - - IMG=nvcc9 - - CMAKE_EXTRA_FLAGS="-DENABLE_CUDA=On -DENABLE_TBB=On" - - DO_TEST=no + - CMAKE_EXTRA_FLAGS="-DENABLE_FORCEINLINE_RECURSIVE=Off -DENABLE_TBB=On" - compiler: nvcc10.2 env: - COMPILER=g++ @@ -86,7 +75,7 @@ matrix: - COMPILER=g++ - IMG=hip - HCC_AMDGPU_TARGET=gfx900 - - CMAKE_EXTRA_FLAGS="-DENABLE_HIP=On -DENABLE_OPENMP=Off -DENABLE_CUDA=Off -DENABLE_WARNINGS_AS_ERRORS=Off" + - CMAKE_EXTRA_FLAGS="-DENABLE_HIP=On -DENABLE_OPENMP=Off -DENABLE_CUDA=Off -DENABLE_WARNINGS_AS_ERRORS=Off -DHIP_HIPCC_FLAGS=-fPIC" - DO_TEST=no diff --git a/CMakeLists.txt b/CMakeLists.txt index e2b9d48137..e85ed1f485 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,9 +8,13 @@ cmake_policy(SET CMP0042 NEW) cmake_policy(SET CMP0048 NEW) +if (APPLE) + cmake_policy(SET CMP0025 NEW) +endif() + # Set version number set(RAJA_VERSION_MAJOR 0) -set(RAJA_VERSION_MINOR 11) +set(RAJA_VERSION_MINOR 12) set(RAJA_VERSION_PATCHLEVEL 0) if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")) @@ -47,7 +51,6 @@ set(ENABLE_GTEST_DEATH_TESTS On CACHE BOOL "Enable tests asserting failure.") set(RAJA_CXX_STANDARD_FLAG "default" CACHE STRING "Specific c++ standard flag to use, default attempts to autodetect the highest available") option(ENABLE_TBB "Build TBB support" Off) -option(ENABLE_CHAI "Build CHAI support" Off) option(ENABLE_TARGET_OPENMP "Build OpenMP on target device support" Off) option(ENABLE_CLANG_CUDA "Use Clang's native CUDA support" Off) option(ENABLE_EXTERNAL_CUB "Use an external cub for scans" Off) @@ -64,6 +67,7 @@ option(ENABLE_FORCEINLINE_RECURSIVE "Enable Forceinline recursive (only supporte option(ENABLE_BENCHMARKS "Build benchmarks" Off) option(RAJA_DEPRECATED_TESTS "Test deprecated features" Off) option(RAJA_ENABLE_BOUNDS_CHECK "Enable bounds checking in RAJA::Views/Layouts" Off) +option(RAJA_TEST_EXHAUSTIVE "Build RAJA exhaustive tests" Off) set(TEST_DRIVER "" CACHE STRING "driver used to wrap test commands") @@ -72,7 +76,7 @@ cmake_minimum_required(VERSION 3.9) if (ENABLE_CUDA) if (DEFINED CUDA_ARCH) if (CUDA_ARCH MATCHES "^sm_*") - if ("${CUDA_ARCH}" STRLESS "sm_35") + if ("${CUDA_ARCH}" STRLESS "sm_35") message( FATAL_ERROR "RAJA requires minimum CUDA compute architecture of sm_35") endif() endif() @@ -85,7 +89,7 @@ if (ENABLE_CUDA) message(STATUS "CUDA compute architecture set to RAJA default sm_35 since it was not specified") set(CUDA_ARCH "sm_35" CACHE STRING "Set CUDA_ARCH to RAJA minimum supported" FORCE) endif() - if (CMAKE_CXX_COMPILER_ID MATCHES GNU) + if ( 
(CMAKE_CXX_COMPILER_ID MATCHES GNU) AND (CMAKE_SYSTEM_PROCESSOR MATCHES ppc64le) ) if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.0) set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -mno-float128") endif () @@ -157,7 +161,9 @@ set (raja_sources src/LockFreeIndexSetBuilders.cpp src/MemUtils_CUDA.cpp src/MemUtils_HIP.cpp - src/PluginStrategy.cpp) + src/PluginStrategy.cpp + src/RuntimePluginLoader.cpp + src/KokkosPluginLoader.cpp) set (raja_depends) @@ -182,7 +188,7 @@ if (ENABLE_CUDA) if (CUB_FOUND) blt_register_library( NAME cub - INCLUDES ${CUB_INCLUDE_DIRS}) + INCLUDES $) set(raja_depends ${raja_depends} cub) @@ -216,38 +222,34 @@ if (ENABLE_HIP) endif () endif () -if (ENABLE_CHAI) - set (raja_depends - ${raja_depends} - chai) -endif () - if (ENABLE_TBB) set(raja_depends ${raja_depends} tbb) endif () -set(EXTERNAL_CAMP_SOURCE_DIR "" CACHE FILEPATH "build with a specific external +if (NOT TARGET camp) + set(EXTERNAL_CAMP_SOURCE_DIR "" CACHE FILEPATH "build with a specific external camp source repository") -if (EXTERNAL_CAMP_SOURCE_DIR) - message(STATUS "Using external source CAMP from: " ${EXTERNAL_CAMP_SOURCE_DIR}) - add_subdirectory(${EXTERNAL_CAMP_SOURCE_DIR} - ${CMAKE_CURRENT_BINARY_DIR}/tpl/camp) -else (EXTERNAL_CAMP_SOURCE_DIR) - find_package(camp QUIET) - if (NOT camp_FOUND) - message(STATUS "Using RAJA CAMP submodule.") - add_subdirectory(tpl/camp) - else (NOT camp_FOUND) - message(STATUS "Using installed CAMP from: ${camp_INSTALL_PREFIX}") - endif(NOT camp_FOUND) -endif (EXTERNAL_CAMP_SOURCE_DIR) + if (EXTERNAL_CAMP_SOURCE_DIR) + message(STATUS "Using external source CAMP from: " ${EXTERNAL_CAMP_SOURCE_DIR}) + add_subdirectory(${EXTERNAL_CAMP_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR}/tpl/camp) + else (EXTERNAL_CAMP_SOURCE_DIR) + find_package(camp QUIET) + if (NOT camp_FOUND) + message(STATUS "Using RAJA CAMP submodule.") + add_subdirectory(tpl/camp) + else (NOT camp_FOUND) + message(STATUS "Using installed CAMP from: ${camp_INSTALL_PREFIX}") + endif(NOT camp_FOUND) + endif (EXTERNAL_CAMP_SOURCE_DIR) +endif (NOT TARGET camp) blt_add_library( NAME RAJA SOURCES ${raja_sources} - DEPENDS_ON ${raja_depends} camp) + DEPENDS_ON ${raja_depends} camp ${CMAKE_DL_LIBS}) install(TARGETS RAJA EXPORT RAJA @@ -262,9 +264,11 @@ target_include_directories(RAJA PUBLIC $ $ - $ - $ $) +target_include_directories(RAJA SYSTEM + PUBLIC + $ + $) install(DIRECTORY include/ DESTINATION include FILES_MATCHING PATTERN *.hpp) if(NOT ENABLE_EXTERNAL_CUB) diff --git a/README.md b/README.md index 6f0ff8f405..2dd606b396 100644 --- a/README.md +++ b/README.md @@ -71,14 +71,14 @@ submodule or as an installed library. User Documentation ------------------- -The [**RAJA User Guide and Tutorial**](http://raja.readthedocs.io/en/master/) +The [**RAJA User Guide and Tutorial**](http://raja.readthedocs.io/en/main/) is the best place to start learning about RAJA and how to use it. To cite RAJA, please use the following references: * RAJA Performance Portability Layer. https://github.com/LLNL/RAJA -* D. A. Beckingsale, J. Burmark, R. Hornung, H. Jones, W. Killian, A. J. Kunen, O. Pearce, P. Robinson, B. S. Ryujin, T. R. W. Scogland, "RAJA: Porrtable Performance for Large-Scale Scientific Applications", 2019 IEEE/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC). [Download here](https://conferences.computer.org/sc19w/2019/#!/toc/14) +* D. A. Beckingsale, J. Burmark, R. Hornung, H. Jones, W. Killian, A. J. Kunen, O. Pearce, P. Robinson, B. S. Ryujin, T. R. W. 
Scogland, "RAJA: Portable Performance for Large-Scale Scientific Applications", 2019 IEEE/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC). [Download here](https://conferences.computer.org/sc19w/2019/#!/toc/14) Related Software -------------------- @@ -114,7 +114,7 @@ The RAJA team follows the [GitFlow](http://nvie.com/posts/a-successful-git-branc include their work in a feature branch created from the RAJA `develop` branch. Then, create a pull request with the `develop` branch as the destination. That branch contains the latest work in RAJA. Periodically, we will merge the -develop branch into the `master` branch and tag a new release. +develop branch into the `main` branch and tag a new release. Authors ----------- diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index b4a5466baa..4ffff68c02 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -10,6 +10,110 @@ Version vxx.yy.zz -- Release date 20yy-mm-dd ============================================ +Version v0.12.0 -- Release date 2020-09-03 +============================================ + +This release contains new features, notable changes, and bug fixes. Please +see the RAJA user guide for more information about items in this release. + +Notable changes include: + + * Notable repository change: + * The 'master' branch in the RAJA git repo has been renamed to 'main'. + + * New features: + * New RAJA "work group" capability added. This allows multiple GPU + kernels to be fused into one kernel launch, greatly reducing the + run time overhead of launching CUDA kernels. + * Added support for dynamic plug-ins in RAJA, which enable the use of + things like Kokkos Performance Profiline Tools to be used with RAJA + (https://github.com/kokkos/kokkos-tools) + * Added ability to pass a resource object to RAJA::forall methods to + enable asynchronous execution for CUDA and HIP back-ends. + * Added "Multi-view" that works like a regular view, except that it + can wrap multiple arrays so their accesses can share index arithmetic. + * Multiple sort algorithms added. This provides portable parallel sort + operations, which are basic parallel algorithm building blocks. + * Introduced RAJA "Teams" concept as an experimental feature. This + enables hierarchical parallelism and additional nested loop patterns + beyond what RAJA::kernel supports. Please note that this is very much + a work-in-progress and is not yet documented in the user guide. + * Added initial support for dynamic loop tiling. + * New OpenMP execution policies added to support static, dynamic, and + guided scheduling. + * Added support for const iterators to be used with RAJA scans. + * Support for bitwise and and or reductions have been added. + * The RAJA::kernel interface has been expanded to allow only segment + index arguments used in a lambda to be passed to the lambda. In + previous versions of RAJA, every lambda invoked in a kernel had to + accept an index argument for every segment in the segment tuple passed + to RAJA::kernel execution templates, even if not all segment indices + were used in a lambda. This release still allows that usage pattern. + The new capability requires an additional template parameter to be + passed to the RAJA::statement::Lambda type, which identify the segment + indices that will be passed and in which order. + + * API Changes: + * The RAJA 'VarOps' namespace has been removed. All entities previously + in that namespace are now in the 'RAJA' namespace. 
+    * RAJA span is now public for users to access and has been made more
+      like std::span.
+    * RAJA::statement::tile_fixed has been moved to RAJA::tile_fixed
+      (namespace change).
+    * RAJA::statement::{Segs, Offsets, Params, ValuesT} have been moved to
+      RAJA::{Segs, Offsets, Params, ValuesT} (namespace change).
+    * RAJA ListSegment constructors have been expanded to accept a camp
+      Resource object. This enables run time specification of the memory
+      space where the data for list segment indices will live. In earlier
+      RAJA versions, the space in which list segment index data lived was a
+      compile-time choice based on whether CUDA or HIP was enabled and the
+      data resided in unified memory for either case. This is still supported
+      in this release, but is marked as a DEPRECATED FEATURE. In the next RAJA
+      release, ListSegment construction will require a camp Resource object.
+      When compiling RAJA with your application, you will see deprecation
+      warnings if you are using the deprecated ListSegment constructor.
+    * A reset method was added to OpenMP target offload reduction classes
+      so they contain the same functionality as reductions for all other
+      back-ends.
+
+  * Build changes/improvements:
+    * The BLT, camp, CUB, and rocPRIM submodules have all been updated to
+      more recent versions. Please note that RAJA now requires rocm version
+      3.5 or newer to use the HIP back-end.
+    * Build for clang9 on macosx has been fixed.
+    * Build for Intel19 on Windows has been fixed.
+    * Host/device annotations have been added to reduction operations to
+      eliminate compiler warnings for certain use cases.
+    * Several warnings generated by the MSVC compiler have been eliminated.
+    * A couple of PGI compiler warnings have been removed.
+    * CMake improvements to make it easier to use an external camp or
+      CUB library with RAJA.
+    * Note that the RAJA tests are undergoing a substantial overhaul. Users
+      who choose to build and run RAJA tests should know that many tests
+      are now being generated in the build space directory structure, which
+      mimics the RAJA source directory structure. As a result, only some
+      test executables appear in the top-level 'test' subdirectory of the
+      build directory; others can be found in lower-level directories. The
+      reason for this change is to reduce test build times for certain
+      compilers.
+
+  * Bug fixes:
+    * An issue with SIMD privatization, which is required to generate
+      correct code with the Intel compiler, has been fixed.
+    * An issue with the atomicExchange() operation for the RAJA HIP back-end
+      has been fixed.
+    * A type issue in the RAJA::kernel implementation involving RAJA span
+      usage has been fixed.
+    * Checks for iterator ranges and container sizes have been added to
+      RAJA scans, which fixes an issue when users attempted to run a
+      scan over a range of size zero.
+    * Several type errors in the Layout.hpp header file have been fixed.
+    * Several fixes have been made in the Layout and StaticLayout types.
+    * Several fixes have been made to the OpenMP target offload back-end
+      to address host-device memory issues.
+    * A variety of RAJA User Guide issues have been addressed, as well as
+      issues in RAJA example codes.
+
 Version v0.11.0 -- Release date 2020-01-29
 ==========================================
@@ -85,7 +189,7 @@ Notable changes include:
   * Added a bounds checking option to RAJA Layout types as a debugging
     feature. This is a compile-time option that will report user errors
     when given View or Layout indices are out-of-bounds.
See View/Layout - section in the RAjA User Guide for instructions on enabling this and + section in the RAJA User Guide for instructions on enabling this and how this feature works. * We've added a RAJA Template Project on GitHub, which shows how to use RAJA in an application, either as a Git submodule or as an diff --git a/blt b/blt index 2c192774b5..bc20f6ab51 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit 2c192774b587c245ec2d7022b2e862395ffa8a21 +Subproject commit bc20f6ab51be6055d8e7ecc3d83e87dc254c7af6 diff --git a/cmake/RAJAMacros.cmake b/cmake/RAJAMacros.cmake index 70cadfc169..0d26065854 100644 --- a/cmake/RAJAMacros.cmake +++ b/cmake/RAJAMacros.cmake @@ -49,6 +49,50 @@ macro(raja_add_executable) ) endmacro(raja_add_executable) +macro(raja_add_plugin_library) + set(options ) + set(singleValueArgs NAME SHARED) + set(multiValueArgs SOURCES DEPENDS_ON) + + cmake_parse_arguments(arg + "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN}) + + list(APPEND arg_DEPENDS_ON RAJA) + + if (ENABLE_OPENMP) + list (APPEND arg_DEPENDS_ON openmp) + endif () + + if (ENABLE_CUDA) + list (APPEND arg_DEPENDS_ON cuda) + endif () + + if (ENABLE_HIP) + list (APPEND arg_DEPENDS_ON hip) + endif () + + if (ENABLE_TBB) + list (APPEND arg_DEPENDS_ON tbb) + endif () + + blt_add_library( + NAME ${arg_NAME} + SOURCES ${arg_SOURCES} + DEPENDS_ON ${arg_DEPENDS_ON} + SHARED ${arg_SHARED} + ) + + #target_include_directories(${arg_NAME} + #PUBLIC + #$ + #$ + #$ + #$ + #$ + #$) + +endmacro(raja_add_plugin_library) + macro(raja_add_test) set(options ) set(singleValueArgs NAME) diff --git a/cmake/SetupCompilers.cmake b/cmake/SetupCompilers.cmake index 2c5d4d6f5c..eb9ec9d2f2 100644 --- a/cmake/SetupCompilers.cmake +++ b/cmake/SetupCompilers.cmake @@ -41,7 +41,7 @@ if ( MSVC ) endif() if (ENABLE_CUDA) - set(CMAKE_CUDA_STANDARD 11) + set(CMAKE_CUDA_STANDARD "11" CACHE STRING "Version of C++ standard for CUDA Builds") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -restrict -arch ${CUDA_ARCH} --expt-extended-lambda --expt-relaxed-constexpr -Xcudafe \"--display_error_number\"") if (NOT RAJA_HOST_CONFIG_LOADED) diff --git a/cmake/SetupRajaConfig.cmake b/cmake/SetupRajaConfig.cmake index f941fa1578..589ba2f84a 100644 --- a/cmake/SetupRajaConfig.cmake +++ b/cmake/SetupRajaConfig.cmake @@ -26,6 +26,7 @@ endif() ## Fault tolerance options option(ENABLE_FT "Enable fault-tolerance features" OFF) option(RAJA_REPORT_FT "Report on use of fault-tolerant features" OFF) +option(ENABLE_ITERATOR_OVERFLOW_DEBUG "Enable Overflow checking during Iterator operations" OFF) ## Timer options set(RAJA_TIMER "chrono" CACHE STRING @@ -62,6 +63,8 @@ set(RAJA_ENABLE_CLANG_CUDA ${ENABLE_CLANG_CUDA}) set(RAJA_ENABLE_HIP ${ENABLE_HIP}) set(RAJA_ENABLE_CUB ${ENABLE_CUB}) +option(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL "Enable use of device function pointers in hip backend" OFF) + # Configure a header file with all the variables we found. 
configure_file(${PROJECT_SOURCE_DIR}/include/RAJA/config.hpp.in ${PROJECT_BINARY_DIR}/include/RAJA/config.hpp) diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index 5ff2012999..642e8db256 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -5,7 +5,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -add_custom_target(docs) +add_custom_target(raja-docs) if (SPHINX_FOUND) add_subdirectory(sphinx/user_guide) diff --git a/docs/doxygen/CMakeLists.txt b/docs/doxygen/CMakeLists.txt index a1f30e42a2..2c83933591 100644 --- a/docs/doxygen/CMakeLists.txt +++ b/docs/doxygen/CMakeLists.txt @@ -21,5 +21,5 @@ add_custom_target(raja-doxygen install(DIRECTORY ${DOXYGEN_HTML_DIR} DESTINATION "docs/doxygen/" OPTIONAL) -add_dependencies(docs +add_dependencies(raja-docs raja-doxygen) diff --git a/docs/sphinx/user_guide/CMakeLists.txt b/docs/sphinx/user_guide/CMakeLists.txt index 0245504b81..be40a41372 100644 --- a/docs/sphinx/user_guide/CMakeLists.txt +++ b/docs/sphinx/user_guide/CMakeLists.txt @@ -23,5 +23,5 @@ add_custom_target(raja-userguide-sphinx install(DIRECTORY "${SPHINX_HTML_DIR}" DESTINATION "docs/user_guide/sphinx/" OPTIONAL) -add_dependencies(docs +add_dependencies(raja-docs raja-userguide-sphinx) diff --git a/docs/sphinx/user_guide/conf.py b/docs/sphinx/user_guide/conf.py index c024360e4a..25b3bb2d91 100644 --- a/docs/sphinx/user_guide/conf.py +++ b/docs/sphinx/user_guide/conf.py @@ -66,9 +66,9 @@ # built documents. # # The short X.Y version. -version = u'0.9' +version = u'0.12' # The full version, including alpha/beta/rc tags. -release = u'0.9.0' +release = u'0.12.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/sphinx/user_guide/config_options.rst b/docs/sphinx/user_guide/config_options.rst index c369f94c9b..8570e09d91 100644 --- a/docs/sphinx/user_guide/config_options.rst +++ b/docs/sphinx/user_guide/config_options.rst @@ -38,21 +38,24 @@ the top-level RAJA directory:: $ make install Following CMake conventions, RAJA supports three build types: ``Release``, -``RelWithDebInfo``, and ``Debug``. Similar to other CMake systems, when you -choose a build type that includes debug information, you do not have to specify -the '-g' compiler flag to generate debugging symbols. +``RelWithDebInfo``, and ``Debug``. With CMake, compiler flags for each of +these build types are applied automatically and so you do not have to +specify them. However, if you want to apply other compiler flags, you will +need to do that using appropriate CMake variables. -All RAJA options are set like standard CMake variables. All RAJA settings for +All RAJA options are set like regular CMake variables. RAJA settings for default options, compilers, flags for optimization, etc. can be found in files -in the ``RAJA/cmake`` directory. Configuration variables can be set by passing +in the ``RAJA/cmake`` directory and top-level ``CMakeLists.txt`` file. +Configuration variables can be set by passing arguments to CMake on the command line when CMake is called, or by setting -options in a CMake cache file and passing that file to CMake. For example, -to enable RAJA OpenMP functionality, pass the following argument to cmake:: +options in a CMake *cache file* and passing that file to CMake using the +CMake ``-C`` options. 
For example, to enable RAJA OpenMP functionality, +pass the following argument to CMake:: -DENABLE_OPENMP=On The RAJA repository contains a collection of CMake cache files -(or 'host-config' files) that may be used as a guide for users trying +(we call them *host-config* files) that may be used as a guide for users trying to set their own options. See :ref:`configopt-raja-hostconfig-label`. Next, we summarize RAJA options and their defaults. @@ -80,18 +83,19 @@ and their default settings: * **Examples, tests, warnings, etc.** - Variables that control whether RAJA tests and examples are built when - the library is compiled are: + Variables that control whether RAJA tests, examples, or tutorial + exercises are built when RAJA is compiled: ====================== ====================== Variable Default ====================== ====================== ENABLE_TESTS On ENABLE_EXAMPLES On + ENABLE_EXERCISES On ====================== ====================== RAJA can also be configured to build with compiler warnings reported as - errors, which may be useful when using RAJA in an application: + errors, which may be useful to make sure your application builds cleanly: ========================= ====================== Variable Default @@ -100,51 +104,57 @@ and their default settings: ========================= ====================== RAJA Views/Layouts may be configured to check for out of bounds - indexing: + indexing at runtime: + ========================= ====================== Variable Default ========================= ====================== RAJA_ENABLE_BOUNDS_CHECK Off ========================= ====================== + + Note that RAJA bounds checking is a runtime check and will add + execution time overhead. Thus, this feature should not be enabled + for release builds. * **Programming model back-ends** Variables that control which RAJA programming model back-ends are enabled are (names are descriptive of what they enable): - ====================== ====================== + ====================== ============================================ Variable Default - ====================== ====================== + ====================== ============================================ ENABLE_OPENMP On - ENABLE_TARGET_OPENMP Off - ENABLE_CUDA Off + ENABLE_TARGET_OPENMP Off (when on, ENABLE_OPENMP must also be on) ENABLE_TBB Off - ====================== ====================== + ENABLE_CUDA Off + ENABLE_HIP Off + ====================== ============================================ Other compilation options are available via the following: - ====================== ====================== + ====================== ========================================== Variable Default - ====================== ====================== - ENABLE_CLANG_CUDA Off + ====================== ========================================== + ENABLE_CLANG_CUDA Off (when on, ENABLE_CUDA must also be on) ENABLE_CUB On (when CUDA enabled) - ====================== ====================== + CUDA_ARCH sm_35 (set based on hardware support) + ====================== ========================================== Turning the 'ENABLE_CLANG_CUDA' variable on will build CUDA code with - the native support in the Clang compiler. When using it, the - 'ENABLE_CUDA' variable must also be turned on. + the native support in the Clang compiler. The 'ENABLE_CUB' variable is used to enable NVIDIA CUB library support for RAJA CUDA scans. Since the CUB library is included in RAJA as a - Git submodule, users should not have to set this in most scenarios. 
+ Git submodule, users should not have to set this in most cases. -.. note:: See :ref:`configopt-raja-backends-label` for more information about - setting compiler flags and other options for RAJA back-ends. +.. note:: See :ref:`getting-started-label` for more information about + setting other options for RAJA back-ends. * **Data types, sizes, alignment, etc.** RAJA provides type aliases that can be used to parameterize floating - point types in applications, which makes it easy to switch between types. + point types in applications, which makes it easier to switch between types. The following variables are used to set the data type for the type alias ``RAJA::Real_type``: @@ -210,32 +220,20 @@ and their default settings: attributes in a typedef. ============================= ======================================== - RAJA internally uses parameters to define platform-specific constants - for index ranges and data alignment. The variables that control these - are: + RAJA internally uses a parameter to define platform-specific constant + data alignment. The variable that control this is: ============================= ====================== Variable Default ============================= ====================== - RAJA_RANGE_ALIGN 4 - RAJA_RANGE_MIN_LENGTH 32 RAJA_DATA_ALIGN 64 ============================= ====================== - What these variables mean: + What this variable means: ============================= ======================================== Variable Meaning ============================= ======================================== - RAJA_RANGE_ALIGN Constrain alignment of begin/end indices - of range segments generated by index set - builder methods; i.e., begin and end - indices of such segments will be - multiples of this value. - RAJA_RANGE_MIN_LENGTH Sets minimum length of range segments - generated by index set builder methods. - This should be an integer multiple of - RAJA_RANGE_ALIGN. RAJA_DATA_ALIGN Specifies data alignment used in intrinsics and typedefs; units of **bytes**. @@ -250,8 +248,10 @@ and their default settings: example codes to determine execution timing and can be used in other apps as well. This timer can use any of three internal timers depending on your preferences, and one should be selected by setting the 'RAJA_TIMER' - variable. If the 'RAJA_CALIPER' variable is turned on (off by default), - the timer will also offer caliper-based region annotations. + variable. If the 'RAJA_USE_CALIPER' variable is turned on (off by default), + the timer will also offer Caliper-based region annotations. Information + about using Caliper can be found at + `Caliper `_ ====================== ====================== Variable Values @@ -299,72 +299,7 @@ and their default settings: Setting RAJA Back-End Features =============================== -To access compiler and hardware optimization features, it is often necessary -to pass options to a compiler. This sections describes how to do this and -which CMake variables to use for certain cases. - -* **OpenMP Compiler Options** - -The variable `OpenMP_CXX_FLAGS` is used to pass OpenMP-related flags to a -compiler. Option syntax follows the CMake *list* pattern. Here is an example -showing how to specify OpenMP target back-end options for NVIDIA GPUs using -the clang compiler as a CMake option:: - - cmake \ - .... - -DOpenMP_CXX_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" - .... 
- -* **CUDA Compiler Options** - -When using the NVIDIA nvcc compiler for RAJA CUDA functionality, the variables: - - * CMAKE_CUDA_FLAGS_RELEASE - * CMAKE_CUDA_FLAGS_DEBUG - * CMAKE_CUDA_FLAGS_RELWITHDEBINFO - -which corresponding to the standard CMake build types are used to pass flags -to nvcc. - -.. note:: When nvcc must pass options to the host compiler, the arguments - can be included in these CMake variables. Each host compiler - option must be prepended with the `-Xcompiler` directive. - -To set the CUDA architecture level for the nvcc compiler, which should be -chosen based on the NVIDIA GPU hardware you are using, you can use the -`CUDA_ARCH` CMake variable. For example, the CMake option:: - - -DCUDA_ARCH=sm_60 - -will tell the compiler to use the `sm_60` SASS architecture in its second -stage of compilation. It will pick the PTX architecture to use in the first -stage of compilation that is suitable for the SASS architecture you specify. - -Alternatively, you may specify the PTX and SASS architectures, using -appropriate nvcc options in the `CMAKE_CUDA_FLAGS_*` variables. - -.. note:: **RAJA requires a minimum CUDA architecture level of `sm_35` to use - all supported CUDA features.** Mostly, the architecture level affects - which RAJA CUDA atomic operations are available and how they are - implemented inside RAJA. This is described in :ref:`atomics-label`. - - * If you do not specify a value for `CUDA_ARCH`, it will be set to - `sm_35` and CMake will emit a status message indicatting this is - the case. - - * If you give a `CUDA_ARCH` value less than `sm_35` (e.g., `sm_30`), - CMake will report this and stop processing. - - -.. _configopt-raja-hostconfig-label: - -======================================= -RAJA Example Build Configuration Files -======================================= - -The ``RAJA/scripts`` directory contains subdirectories with a variety of -build scripts we use to build and test RAJA on various platforms with -various compilers. These scripts pass files (*CMake cache files*) in -the ``RAJA/host-configs`` directory to CMake using the '-C' option. -These files serve as useful examples of how to configure RAJA prior to -compilation. +Various `ENABLE_*` options are listed above for enabling RAJA back-ends, +such as OpenMP and CUDA. To access compiler and hardware optimization features, +it may be necessary to pass additional options to CMake. Please see +:ref:`getting_started-label` for more information. diff --git a/docs/sphinx/user_guide/contributing.rst b/docs/sphinx/user_guide/contributing.rst index fb511cad6d..a97d2123f5 100644 --- a/docs/sphinx/user_guide/contributing.rst +++ b/docs/sphinx/user_guide/contributing.rst @@ -12,9 +12,11 @@ Contributing to RAJA ==================== -This section is intended for folks who want to contribute new features or -bugfixes to RAJA. It assumes you are familiar with Git and GitHub. It -describes what a good pull request (PR) looks like, and the tests that your +RAJA is a collaborative open source software project and it embraces +contributions from others who want to add features or improve existing +features. This section is intended for folks who want to contribute new +features or bugfixes to RAJA. It assumes you are familiar with Git and GitHub. +It describes what a good pull request (PR) looks like, and the tests that your PR must pass before it can be merged into RAJA. 
------------ @@ -22,8 +24,10 @@ Forking RAJA ------------ If you aren't a RAJA developer at LLNL, then you won't have permission to push -new branches to the repository. First, you should create a `fork of the repo -`_. This will create a copy +new branches to the repository. This is due to the policy adopted by the LLNL +organization on GitHub in which the RAJA project resides. Fortunately, you may +still contribute to RAJA by `forking the RAJA repo +`_. This will create a copy of the RAJA repository that you own, and will ensure you can push your changes to GitHub and create pull requests. @@ -33,7 +37,7 @@ Developing a New Feature New features should be based on the RAJA ``develop`` branch. When you want to create a new feature, first ensure you have an up-to-date copy of the -``develop`` branch: +``develop`` branch locally: .. code-block:: bash @@ -48,11 +52,12 @@ Then, create a new branch to develop your feature on: Proceed to develop your feature on this branch pushing changes with reasonably-sized atomic commits, and add tests that will exercise your new -code. If you are creating new methods or classes, please -add Doxygen documentation. +code. If you are creating new functionality, please add documentation to +the `RAJA User Guide `_. Once your feature is complete and your tests are passing, you can push your -branch to GitHub and create a PR. +branch to GitHub and create a PR. It will be reviewed by members of the +core RAJA team, who will provide comments, suggestions, etc. -------------------- Developing a Bug Fix @@ -62,7 +67,7 @@ First, check if the change you want to make has been addressed in the RAJA ``develop`` branch. If so, we suggest you either start using the ``develop`` branch, or temporarily apply the fix to whichever version of RAJA you are using. -Assuming there is an unsolved bug, first make sure you have an up-to-date copy +If there is an unresolved bug, first make sure you have an up-to-date copy of the ``develop`` branch: .. code-block:: bash @@ -86,16 +91,20 @@ Once you are finished, you can push your branch to GitHub, then create a PR. Creating a Pull Request ----------------------- -You can create a new PR `here `_. GitHub -has a good `guide `_ on +You can create a pull request (PR) +`here `_. GitHub has a good +`PR guide `_ on PR basics if you want more information. Ensure that your PR base is the ``develop`` branch of RAJA. -Add a descriptive title explaining the bug you fixed or the feature you have -added, and put a longer description of the changes you have made in the comment -box. +When you create a RAJA PR, you must enter basic information about the +contents of the PR and what it does in the PR summary. Add a descriptive title +explaining the bug you fixed or the feature you have added, and put a longer +description of the changes you have made in the comment box. This will help +reviewers understand your intent and provide a more useful review of your +work. -Once your PR has been created, it will be run through our automated tests and +After your PR has been created, it will be run through our automated tests and also be reviewed by RAJA team members. Providing the branch passes both the tests and reviews, it will be merged into RAJA. @@ -111,5 +120,11 @@ is used on a wide variety of systems with a number of configurations, and adding new tests helps ensure that all features work as expected across these environments. -All RAJA tests are in the ``RAJA/test`` directory and are split up by -programming model back-end and feature. 
+All RAJA tests are in the ``RAJA/test`` directory and are split into +*unit tests* and *functional tests*. Unit tests are intended to test basic +interfaces and features of individual classes, methods, etc. Functional tests +are used to test combinations of RAJA features. Please follow the implementation +pattern of existing tests. We have organized our tests to make it easy to see +what is being tested and easy to add new tests, for a new programming model +back-end, for example. + diff --git a/docs/sphinx/user_guide/developer_guide.rst b/docs/sphinx/user_guide/developer_guide.rst new file mode 100644 index 0000000000..2d1ecd1e59 --- /dev/null +++ b/docs/sphinx/user_guide/developer_guide.rst @@ -0,0 +1,74 @@ +.. developer_guide: + +=============== +Developer Guide +=============== + +Generating RAJA host-config files +=================================== + +.. note:: + This is optional if you are on LC machines, since some host-config files have already been generated (at least for Quartz and Lassen) and can be found in the ``host-configs`` repository directory. + +RAJA only directly depends on CMake. However, this mechanism will generate a cmake configuration file that reproduces the configuration `Spack ` would have generated in the same context. It contains all the information necessary to build RAJA with the described toolchain. + +In particular, the host config file will setup: +* flags corresponding with the target required (Release, Debug). +* compilers path, and other toolkits (cuda if required), etc. + +This provides an easy way to build RAJA based on `Spack ` and encapsulated in `Uberenv `_. + +Uberenv role +------------ + +Uberenv helps by doing the following: + +* Pulls a blessed version of Spack locally +* If you are on a known operating system (like TOSS3), we have defined compilers and system packages so you don't have to rebuild the world (CMake typically in RAJA). +* Overrides RAJA Spack packages with the local one if it exists. (see ``scripts/uberenv/packages``). +* Covers both dependencies and project build in one command. + +Uberenv will create a directory ``uberenv_libs`` containing a Spack instance with the required RAJA dependencies installed. It then generates a host-config file (``.cmake``) at the root of RAJA repository. + +Using Uberenv to generate the host-config file +---------------------------------------------- + +.. code-block:: bash + + $ python scripts/uberenv/uberenv.py + +.. note:: + On LC machines, it is good practice to do the build step in parallel on a compute node. Here is an example command: ``srun -ppdebug -N1 --exclusive python scripts/uberenv/uberenv.py`` + +Unless otherwise specified Spack will default to a compiler. It is recommended to specify which compiler to use: add the compiler spec to the ``--spec`` Uberenv command line option. + +On blessed systems, compiler specs can be found in the Spack compiler files in our repository: ``scripts/uberenv/spack_configs//compilers.yaml``. + +Some examples uberenv options: + +* ``--spec=%clang@9.0.0`` +* ``--spec=%clang@8.0.1+cuda`` +* ``--prefix=`` + +Building dependencies can take a long time. If you already have a spack instance you would like to reuse (in supplement of the local one managed by Uberenv), you can do so changing the uberenv command as follow: + +.. 
code-block:: bash
+
+   $ python scripts/uberenv/uberenv.py --upstream=/opt/spack
+
+Using host-config files to build RAJA
+-------------------------------------
+
+When a host-config file exists for the desired machine and toolchain, it can easily be used in the CMake build process:
+
+If I need to build RAJA with clang and cuda on lassen, I can see there is already a host-config file named ``lassen-blueos_3_ppc64le_ib_p9-clang@8.0.1-cuda.cmake``. To use it (on lassen):
+
+.. code-block:: bash
+
+   $ mkdir build && cd build
+   $ cmake -C ../host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@8.0.1-cuda.cmake ..
+   $ cmake --build . -j
+   $ ctest --output-on-failure -T test
+
+.. note::
+   This will build the default configuration. Not all parameters are embedded into the host-config file. For example, producing shared/static libraries, using OpenMP, or enabling tests must still be configured on the command line.
diff --git a/docs/sphinx/user_guide/feature/iteration_spaces.rst b/docs/sphinx/user_guide/feature/iteration_spaces.rst
index a4ebff7eeb..7519e52ef2 100644
--- a/docs/sphinx/user_guide/feature/iteration_spaces.rst
+++ b/docs/sphinx/user_guide/feature/iteration_spaces.rst
@@ -36,18 +36,15 @@ Just like traditional C and C++ for-loops, RAJA uses index variables to
 identify loop iterates. Any lambda expression that represents all or part
 of a loop body passed to a ``RAJA::forall`` or ``RAJA::kernel`` method will
 take at least one loop index variable argument. RAJA iteration space types
-and methods are templates that allow users to use any integral type for an
+are templates that allow users to use any integral type for an
 index variable. The index variable type may be explicitly specified by a user.
-RAJA also provides a ``RAJA::Index_type`` type, which is used as a default
+RAJA also provides the ``RAJA::Index_type`` type, which is used as a default
 in some circumstances for convenience by allowing use of a common type alias
 to typed constructs without explicitly specifying the type.
-The ``RAJA::Index_type`` type is an alias to the C++ type 'std::ptrdiff_t',
+The ``RAJA::Index_type`` type is an alias to the C++ type ``std::ptrdiff_t``,
 which is appropriate for most compilers to generate useful loop-level
 optimizations.
 
-.. note:: Users can change the type of ``RAJA::Index_type`` by editing the RAJA
-          ``RAJA/include/RAJA/util/types.hpp`` header file.
-
 .. _segments-label:
 
 -------------
 Segments
 -------------
@@ -78,7 +75,7 @@ One can create an explicitly-typed range segment or one with the default
    RAJA::RangeSegment default_range(beg, end);
 
 .. note:: When using a RAJA range segment, no loop iterations will be run when
-          begin is greater-than-or-equal-to end.
+          begin is greater-than-or-equal-to end, similar to a C-style for-loop.
 
 Strided Segments
 ^^^^^^^^^^^^^^^^^^^
@@ -139,6 +136,18 @@ segment constructor. For example::
 
   // Create list segment with these loop indices
  RAJA::TypedListSegment<int> idx_list( &idx[0], static_cast<int>(idx.size()) );
 
+Using a list segment in a RAJA loop traversal template will run the loop
+indices specified in the array passed to the list segment constructor. That
+is, using 'idx_list' from above::
+
+  RAJA::forall< RAJA::seq_exec >( idx_list, [=] (RAJA::Index_type i) {
+    printf("%ld ", i);
+  } );
+
+will print the values::
+
+   0 2 3 4 7 8 9 53
+
 Similar to range segment types, RAJA provides ``RAJA::ListSegment``, which is
 a type alias to ``RAJA::TypedListSegment`` using ``RAJA::Index_type`` as the
 template type parameter.
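Pulling the fragments above into one self-contained sketch (the index values are the same illustrative ones used in the documentation text, and the constructor shown is the pointer/length form used above)::

   #include <cstdio>
   #include <vector>
   #include "RAJA/RAJA.hpp"

   int main()
   {
     std::vector<int> idx{0, 2, 3, 4, 7, 8, 9, 53};

     // Build a typed list segment over the index array. (Newer RAJA versions
     // also provide a constructor taking a camp resource object, as noted in
     // the v0.12.0 release notes, to select the memory space for the indices.)
     RAJA::TypedListSegment<int> idx_list(&idx[0],
                                          static_cast<int>(idx.size()));

     // Visit only the listed indices, in the order given.
     RAJA::forall<RAJA::seq_exec>(idx_list, [=](int i) {
       std::printf("%d ", i);
     });
     std::printf("\n");

     return 0;
   }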
@@ -197,16 +206,17 @@ loop execution template to execute the indices defined by its segments:: // Run a kernel with iterates defined by the index set RAJA::forall(iset, [=] (int i) { ... }); -.. note:: Iterating over the indices of all segments in a RAJA index set - requires a two-level execution policy. The outer level specifies - how to iterate over the seqments. The inner level specifies how - each segment will execute. See :ref:`indexsetpolicy-label` for - more information about IndexSet execution policies. - In this example, the loop iterations will execute in three chunks defined by the two range segments and one list segment. The segments will be iterated over in parallel using OpenMP, and each segment will execute sequentially. +.. note:: Iterating over the indices of all segments in a RAJA index set + requires a two-level execution policy, with two template parameters, + as shown above. The first parameter specifies how to iterate over + the seqments. The second parameter specifies how each segment will + execute. See :ref:`indexsetpolicy-label` for more information about + RAJA index set execution policies. + .. note:: It is the responsibility of the user to ensure that segments are defined properly when using RAJA index sets. For example, if the same index appears in multiple segments, the corresponding loop diff --git a/docs/sphinx/user_guide/feature/local_array.rst b/docs/sphinx/user_guide/feature/local_array.rst index b6a2e55ef8..7207732b85 100644 --- a/docs/sphinx/user_guide/feature/local_array.rst +++ b/docs/sphinx/user_guide/feature/local_array.rst @@ -12,10 +12,10 @@ Local Array =========== -This section introduces RAJA local arrays. A ``RAJA::LocalArray`` is a -multi-dimensional array object whose memory is allocated when a RAJA kernel -is executed and only lives within the scope of the kernel execution. To -motivate the concept and usage, consider a simple C++ example +This section introduces RAJA *local arrays*. A ``RAJA::LocalArray`` is an +array object with one or more dimensions whose memory is allocated when a +RAJA kernel is executed and only lives within the scope of the kernel +execution. To motivate the concept and usage, consider a simple C++ example in which we construct and use two arrays in nested loops:: for(int k = 0; k < 7; ++k) { //k loop @@ -93,19 +93,20 @@ two-dimensional and one one-dimensional and creates an instance of each type. The template arguments for the ``RAJA::LocalArray`` types are: * Array data type - * Index permutation (see :ref:`view-label` for more on layouts and permutations) + * Index permutation (see :ref:`view-label` for more on RAJA permutations) * Array dimensions .. note:: ``RAJA::LocalArray`` types support arbitrary dimensions and sizes. The kernel policy is a two-level nested loop policy (see -:ref:`loop_elements-kernel-label`` for more information) with a statement type -``RAJA::statement::InitLocalMem`` inserted between the nested for-loops which -allocates the memory for the local arrays when the kernel executes. -The ``InitLocalMem`` statement type uses a 'CPU tile' memory type, for the -two entries '0' and '1' in the kernel parameter tuple (second argument to -``RAJA::kernel_param``). Then, the inner initialization loop and inner print -loops are run with the respective lambda bodies defined in the kernel. 
+:ref:`loop_elements-kernel-label` for information about RAJA kernel policies) +with a statement type ``RAJA::statement::InitLocalMem`` inserted between the +nested for-loops which allocates the memory for the local arrays when the +kernel executes. The ``InitLocalMem`` statement type uses a 'CPU tile' memory +type, for the two entries '0' and '1' in the kernel parameter tuple +(second argument to ``RAJA::kernel_param``). Then, the inner initialization +loop and inner print loop are run with the respective lambda bodies defined +in the kernel. ------------------- Memory Policies diff --git a/docs/sphinx/user_guide/feature/loop_basic.rst b/docs/sphinx/user_guide/feature/loop_basic.rst index 741a7d4d66..8d62c6bd4a 100644 --- a/docs/sphinx/user_guide/feature/loop_basic.rst +++ b/docs/sphinx/user_guide/feature/loop_basic.rst @@ -18,11 +18,12 @@ RAJA interface for loop execution. ``RAJA::forall`` methods execute simple loops (e.g., non-nested loops) while ``RAJA::kernel`` methods support nested loops and other complex loop kernels and transformations. -.. note:: * All **forall** and **kernel** methods are in the namespace ``RAJA``. +.. note:: * All ``forall`` and ``kernel`` methods are in the namespace ``RAJA``. * A ``RAJA::forall`` loop execution method is a template on an *execution policy* type. A ``RAJA::forall`` method takes two arguments: - * an iteration space object, and + * an iteration space object, such as a contiguous range of loop + indices, and * a lambda expression representing the loop body. * Each ``RAJA::kernel`` method is a template on a policy that contains statements with *execution policy* types appropriate for @@ -45,8 +46,8 @@ Simple Loops (RAJA::forall) --------------------------- As noted earlier, a ``RAJA::forall`` template executes simple -(e.g., non-nested) loops. For example, a C-style loop that adds two vectors, -like:: +(i.e., non-nested) loops. For example, a C-style loop that adds two vectors, +like this:: for (int i = 0; i < N; ++i) { c[i] = a[i] + b[i]; @@ -67,19 +68,20 @@ objects enable the loop iterates to be partitioned, reordered, run in different threads, etc. .. note:: Changing loop execution policy types and iteration space constructs - enable loops to run in different ways by recompiling the code and + enables loops to run in different ways by recompiling the code and without modifying the loop kernel code. While loop execution using ``RAJA::forall`` methods is a subset of ``RAJA::kernel`` functionality, described next, we maintain the ``RAJA::forall`` interface for simple loop execution because the syntax is -simpler and less verbose. +simpler and less verbose for that use case. .. note:: Data arrays in lambda expressions used with RAJA are typically RAJA Views (see :ref:`view-label`) or bare pointers as shown in the code snippets above. Using something like 'std::vector' is - non-portable (won't work in CUDA kernels) and would add excessive - overhead for copying data into the lambda data environment. + non-portable (won't work in GPU kernels, generally) and would add + excessive overhead for copying data into the lambda data environment + when captured by value. .. _loop_elements-kernel-label: @@ -99,7 +101,7 @@ consider a (N+1)-level C-style loop nest:: } Note that we could write this by nesting ``RAJA::forall`` statements and -it would work, assuming the execution policies were chosen properly:: +it would work for some execution policy choices:: RAJA::forall(IN, [=] (int iN) { ... 
@@ -111,19 +113,22 @@ it would work, assuming the execution policies were chosen properly:: However, this approach treats each loop level as an independent entity. This makes it difficult to parallelize the levels in the loop nest together. So it -limits the amount of parallelism that can be exposed and the types of +may limit the amount of parallelism that can be exposed and the types of parallelism that may be used. For example, if an OpenMP or CUDA parallel execution policy is used on the outermost loop, then all inner loops would be run sequentially in each thread. It also makes it difficult to perform -transformations like loop interchange and loop collapse. +transformations like loop interchange and loop collapse without changing the +source code, which breaks RAJA encapsulation. -The RAJA *kernel* interface facilitates parallel execution and transformations -of arbitrary loop nests and other complex loops. It can treat a complex loop -structure as a single entity, which simplifies the ability to apply kernel -transformations and different parallel execution patterns by changing one -execution policy type. +.. note:: **We do not recommend nesting ``RAJA::forall`` statements.** -The loop nest may be written using the RAJA kernel interface as:: +The RAJA *kernel* interface facilitates parallel execution and compile-time +transformation of arbitrary loop nests and other complex loop structures. +It can treat a complex loop structure as a single entity, which simplifies +the ability to transform and apply different parallel execution patterns by +changing the execution policy type and *not the kernel code*. + +The loop above nest may be written using the RAJA kernel interface as:: using KERNEL_POL = RAJA::KernelPolicy< RAJA::statement::For' symbols enclosing the template parameter lists. + One can think of the '<, >' symbols enclosing the template parameter + lists as being similar to the curly braces in C-style code. Here, the innermost type in the kernel policy is a ``RAJA::statement::Lambda<0>`` type indicating that the first lambda expression @@ -175,11 +180,15 @@ enables non-perfectly nested loops. RAJA offers two types of lambda statements. The first as illustratated above, requires that each lambda expression passed to a ``RAJA::kernel`` method **must take an index argument for each iteration space in the tuple**. -However, any subset of the arguments may actually be used in each lambda expression. +With this type of lambda statement, the entire iteration space must be active +in a containing ``For`` construct. A compile time ``static_assert`` will be +triggered if any of the arguments are undefined, indicating that something +is not correct. The second type of lambda statement, an extension of the first, takes additional -template parameters which are used to specify lambda arguments. This results in -kernel lambdas only requiring arguments which will be used within the body. +template parameters which specify which iteration space indices are passed +as lambda arguments. The result is that a kernel lambda only needs to accept +iteration space index arguments that are used in the lambda body. The kernel policy list with lambda arguments may be written as:: @@ -187,29 +196,30 @@ The kernel policy list with lambda arguments may be written as:: RAJA::KernelPolicy< RAJA::statement::For> + RAJA::statement::Lambda<0, RAJA::Segs> > ... > >; -The template parameter ``RAJA::statement::Segs`` is used to identify elements from the -segment tuple to be used as arguments for a lambda. 
RAJA offers other statements
-such as ``Offsets``, and ``Params`` to identify offsets and parameters in segments and
-param tuples respectively to be used as lambda argumentsx. See :ref:`matrixmultiply-label`
-and :ref:`matrixtransposelocalarray-label` for detailed examples.
-
+The template parameter ``RAJA::Segs`` is used to specify which elements in the
+segment tuple are used to pass arguments to a lambda. RAJA offers other
+types such as ``RAJA::Offsets`` and ``RAJA::Params`` to identify offsets and
+parameters in segments and param tuples respectively to be used as lambda
+arguments. See :ref:`matrixmultiply-label` and
+:ref:`matrixtransposelocalarray-label` for detailed examples.
 
-.. note:: Unless lambda arguments are specified through RAJA lambda statements,
+.. note:: Unless lambda arguments are specified in RAJA lambda statements,
   the loop index arguments for each lambda expression used in a RAJA
   kernel loop body **must match** the contents of the
   *iteration space tuple* in number, order, and type. Not all index
-   arguments must be used in each lambda, but they **all must appear**
-   for the RAJA kernel to be well-formed. In particular, your code will
-   not compile if this is not done correctly. If an argument is unused
-   in a lambda expression, you may include its type and omit its name
-   in the argument list to avoid compiler warnings just as one would do
-   for a regular C++ method.
+   arguments must be used in a lambda, but they **all must appear**
+   in the lambda argument list and **all must be in active loops** to be
+   well-formed. In particular, your code will not compile if this is
+   not done correctly. If an argument is unused in a lambda expression,
+   you may include its type and omit its name in the argument list to
+   avoid compiler warnings just as one would do for a regular C++
+   method with unused arguments.
 
 For RAJA nested loops implemented with ``RAJA::kernel``, as shown here, the
 loop nest ordering is determined by the order of the nested policies, starting
@@ -227,7 +237,11 @@ See :ref:`matmultkernel-label` for a complete example showing RAJA nested loop
 functionality and :ref:`nestedreorder-label` for a detailed example describing
 nested loop reordering.
 
-A summary of all RAJA execution policies that may be used with ``RAJA::forall``
-or ``RAJA::kernel`` may be found in :ref:`policies-label`. Also, a discussion
-of how to construct ``RAJA::KernelPolicy`` types and available
-``RAJA::statement`` types can be found in :ref:`loop_elements-kernelpol-label`.
+.. note:: In general, RAJA execution policies for ``RAJA::forall`` and
+   ``RAJA::kernel`` are different. A summary of all RAJA execution
+   policies that may be used with ``RAJA::forall`` or ``RAJA::kernel``
+   may be found in :ref:`policies-label`.
+
+Finally, a discussion of how to construct ``RAJA::KernelPolicy`` types and
+available ``RAJA::statement`` types can be found in
+:ref:`loop_elements-kernelpol-label`.
diff --git a/docs/sphinx/user_guide/feature/plugins.rst b/docs/sphinx/user_guide/feature/plugins.rst
new file mode 100644
index 0000000000..5592ec83fe
--- /dev/null
+++ b/docs/sphinx/user_guide/feature/plugins.rst
@@ -0,0 +1,129 @@
+.. ##
+.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC
+.. ## and other RAJA project contributors. See the RAJA/COPYRIGHT file
+.. ## for details.
+.. ##
+.. ## SPDX-License-Identifier: (BSD-3-Clause)
+.. ##
+
+.. _plugins-label:
+
+========
+Plugins
+========
+
+------------------
+About RAJA Plugins
+------------------
+
+RAJA supports user-made plugins that may be loaded either at the time of compilation or during runtime. These two methods are not mutually exclusive, as plugins loaded statically can be run alongside plugins that are loaded dynamically.
+
+------------------
+Using RAJA Plugins
+------------------
+
+^^^^^^^^^^^^^^^^^^^^^^^^^
+Static vs Dynamic Loading
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+**Static loading** is done at compile time and requires recompilation in order to add, remove, or change a plugin. This is arguably the easier method to implement, requiring only simple file linking to make it work. However, recompilation may get tedious and resource-heavy when working with many plugins or on large projects. In these cases, it may be a better idea to load plugins dynamically, which requires no recompilation of the project most of the time.
+
+**Dynamic loading** is done at runtime and only requires the recompilation or moving of plugin files in order to add, remove, or change a plugin. This will likely require more work to set up, but in the long run may save time and resources. RAJA will look at the environment variable ``RAJA_PLUGINS`` for a path to a plugin or plugin directory, and automatically load them at runtime. This means that a plugin can be added to a project as easily as making a shared object file and setting ``RAJA_PLUGINS`` to the appropriate path.
+
+^^^^^^^^^^^^^^^^^
+Quick Start Guide
+^^^^^^^^^^^^^^^^^
+
+**Static**
+
+1. Build RAJA normally.
+
+2. Either use an ``#include`` statement within the code or compiler flags to load your plugin file with your project at compile time. A brief example of this would be something like ``g++ project.cpp plugin.cpp -lRAJA -fopenmp -ldl -o project``.
+
+3. When you run your project, your plugin should work!
+
+**Dynamic**
+
+1. Build RAJA normally.
+
+2. Compile your plugin into a shared object file with a .so extension. A brief example of this would be something like ``g++ plugin.cpp -lRAJA -fopenmp -fPIC -shared -o plugin.so``.
+
+3. Set the environment variable ``RAJA_PLUGINS`` to be the path of your .so file. This can either be the path to its directory or to the shared object file itself. If the path is to a directory, it will attempt to load all .so files in that directory.
+
+4. When you run your project, your plugins should work!
+
+^^^^^^^^^^^^^^^^^^^^^^^^
+Interfacing with Plugins
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The RAJA Plugin API allows for limited interfacing between a project and a plugin. There are, however, two functions that allow this to take place: ``init_plugins`` and ``finalize_plugins``. Using one of these will call the corresponding ``init`` or ``finalize`` function inside of *every* currently loaded plugin. It's worth noting that plugins don't require either an init or finalize function by default.
+
+* ``RAJA::util::init_plugins();`` - Will call the ``init`` function of every currently loaded plugin.
+
+* ``RAJA::util::init_plugins("path/to/plugins");`` - Does the same as the above call to init_plugins, but will also dynamically load plugins located at the path specified.
+
+* ``RAJA::util::finalize_plugins();`` - Will call the ``finalize`` function of every currently loaded plugin.
+
+-------------------------
+Creating Plugins For RAJA
+-------------------------
+
+Plugins take advantage of *polymorphism*, using ``RAJA::util::PluginStrategy`` as the parent and implementing the required functions for the API. An example implementation can be found at the bottom of this page.
+
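A minimal sketch of such a plugin is shown here; the class name, counter, and registration strings are placeholder choices, not part of RAJA, and the hook signatures follow the function list in the next section::

   #include <iostream>
   #include "RAJA/util/PluginStrategy.hpp"

   class LaunchLogger : public RAJA::util::PluginStrategy
   {
   public:
     // Called by RAJA immediately before a forall/kernel launches.
     void preLaunch(const RAJA::util::PluginContext&) override {
       std::cout << "RAJA launch #" << ++count_ << " starting\n";
     }

     // Called by RAJA immediately after the launch completes.
     void postLaunch(const RAJA::util::PluginContext&) override {
       std::cout << "RAJA launch #" << count_ << " done\n";
     }

   private:
     int count_{0};
   };

   // Static registration (see 'Static Loading' below); the template argument
   // names the plugin class being registered.
   static RAJA::util::PluginRegistry::add<LaunchLogger> reg(
       "LaunchLogger", "Logs each RAJA kernel launch");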
+
+^^^^^^^^^^^
+Functions
+^^^^^^^^^^^
+
+The ``preLaunch`` and ``postLaunch`` functions are automatically called by
+RAJA before and after loop execution. This applies to RAJA's kernel and
+forall implementations.
+
+* ``void init(const PluginOptions& p) override {}`` - Runs on all plugins
+  when the user makes a call to ``init_plugins``.
+
+* ``void preCapture(const PluginContext& p) override {}`` - Will occur before
+  capture of kernel/forall.
+
+* ``void postCapture(const PluginContext& p) override {}`` - Will occur after
+  capture of kernel/forall.
+
+* ``void preLaunch(const PluginContext& p) override {}`` - Will occur before
+  kernel/forall execution.
+
+* ``void postLaunch(const PluginContext& p) override {}`` - Will occur after
+  kernel/forall execution.
+
+* ``void finalize() override {}`` - Runs on all plugins when the user makes a
+  call to ``finalize_plugins``. This will also unload all currently loaded
+  plugins.
+
+``init`` and ``finalize`` are never run by RAJA by default; they are run only
+when the user makes a call to ``RAJA::util::init_plugins()`` or
+``RAJA::util::finalize_plugins()``, respectively.
+
+^^^^^^^^^^^^^^^^^
+Static Loading
+^^^^^^^^^^^^^^^^^
+
+If the plugin is to be loaded into a project at compile time, adding the
+following one-liner will add the plugin to the RAJA PluginRegistry so that it
+is loaded every time the compiled executable is run. This requires the plugin
+to be compiled with the project, either via an ``#include`` statement within
+the project or by compiler commands.
+::
+
+  static RAJA::util::PluginRegistry::add<MyPluginName> P("Name", "Description");
+
+
+^^^^^^^^^^^^^^^^^
+Dynamic Loading
+^^^^^^^^^^^^^^^^^
+
+If the plugin is to be dynamically loaded into a project at runtime, the RAJA
+Plugin API requires a few conditions to be met. The following must be true
+about the plugin, not necessarily of the project using it.
+
+1. **The plugin must have the following factory function.** This will return
+   a pointer to an instance of your plugin. Thanks to the ``extern "C"``, a
+   project will be able to search for "getPlugin" within the dynamically
+   loaded plugin correctly.
+::
+
+  extern "C" RAJA::util::PluginStrategy *getPlugin ()
+  {
+    return new MyPluginName;
+  }
+
+
+2. **The plugin must be compiled to be a shared object with a .so extension.**
+   A simple example containing the required flags would be:
+   ``g++ plugin.cpp -lRAJA -fopenmp -fPIC -shared -o plugin.so``. At the
+   moment, RAJA will only attempt to load files with .so extensions. It is
+   worth noting why these flags (or their equivalents) are important.
+   ``-lRAJA -fopenmp`` are the standard flags for compiling against the RAJA
+   library. For the purposes of dynamic loading, ``-fPIC`` tells the compiler
+   to produce *position independent code*, which is needed to prevent
+   conflicts in the address space of the executable. ``-shared`` tells the
+   compiler to produce a shared object, removing the need for a *main* as
+   well as giving the loading executable access to functions flagged with
+   ``extern "C"``.
+
+3. **The** ``RAJA_PLUGINS`` **environment variable has been set**, or the
+   user has made a call to ``RAJA::util::init_plugins("path");`` with a path
+   specified to either a directory or a .so file. It is worth noting that
+   these are not mutually exclusive; RAJA will look for plugins from the
+   environment variable on program startup, and new plugins may be loaded
+   after that using ``init_plugins``.
+ + +^^^^^^^^^^^^^^^^^ +Example Implementation +^^^^^^^^^^^^^^^^^ + +The following is an example plugin that simply will print out the number of times a kernel has been launched and has the ability to be loaded either statically or dynamically. + +.. literalinclude:: ../../../../examples/plugin/counter-plugin.cpp + :start-after: _plugin_example_start + :end-before: _plugin_example_end + :language: C++ diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index bbf0166f6b..cd75aa0904 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -13,7 +13,7 @@ Policies ================== This section describes various RAJA policies for loop kernel execution, -scans, reductions, atomics, etc. Each policy is a type that is passed to +scans, sorts, reductions, atomics, etc. Each policy is a type that is passed to a RAJA template method or class to specialize its behavior. Typically, the policy indicates which programming model back-end to use and sometimes provides additional information about the execution pattern, such as @@ -38,15 +38,16 @@ caveats. ====================================== ============= ========================== seq_exec forall, Strictly sequential kernel (For), execution - scan + scan, + sort simd_exec forall, Try to force generation of kernel (For), SIMD instructions via scan compiler hints in RAJA internal implementation loop_exec forall, Allow compiler to generate kernel (For), any optimizations, such as - scan SIMD, that may be - beneficial according to + scan, SIMD, that may be + sort beneficial according to its heuristics; i.e., no loop decorations (pragmas or intrinsics) in @@ -57,29 +58,47 @@ caveats. OpenMP CPU Multithreading Policies Works with Brief description ====================================== ============= ========================== omp_parallel_for_exec forall, Create OpenMP parallel - kernel (For), region and execute with - scan CPU multithreading inside - it; i.e., apply ``omp - parallel for`` pragma + kernel (For), region and execute with + scan, CPU multithreading inside + sort it; i.e., apply ``omp + parallel for`` pragma omp_for_exec forall, Parallel execution with - kernel (For) OpenMP CPU multithreading - inside an *existing* - parallel region; i.e., + kernel (For), OpenMP CPU multithreading + scan inside an *existing* + parallel region (see + comments below); i.e., apply ``omp for`` pragma omp_for_static forall, Execute loop with OpenMP - kernel (For) CPU multithreading using - static schedule and given + kernel (For), CPU multithreading using + scan static schedule and given chunk size inside an *existing* parallel - region; i.e., apply ``omp for schedule(static, + region (see comments + below); i.e., apply ``omp for schedule(static, CHUNK_SIZE)`` pragma omp_for_nowait_exec forall, Parallel execution with - kernel (For) OpenMP CPU multithreading - inside an existing - parallel region without + kernel (For), OpenMP CPU multithreading + scan inside an *existing* + parallel region (see + comments below) without synchronization after loop; i.e., apply ``omp for nowait`` pragma + omp_for_schedule_exec forall, Parallel execution with + kernel (For) OpenMP CPU multithreading + inside an *existing* + parallel region (see + comments below) with a + specified schedule (*Sched*) + omp_for_nowait_schedule_exec forall, Parallel execution with + kernel (For) OpenMP CPU multithreading + inside an *existing* + parallel region (see + comments below) with a + specified 
schedule (*Sched*) + and without synchronization + after loop; e.g., append + ``nowait`` to pragma ====================================== ============= ========================== ====================================== ============= ========================== @@ -87,22 +106,22 @@ caveats. ====================================== ============= ========================== tbb_for_exec forall, Execute loop iterations kernel (For), as tasks in parallel using - scan TBB ``parallel_for`` + scan TBB ``parallel_for`` method tbb_for_static forall, Same as above, but use kernel (For), a static scheduler with scan given chunk size tbb_for_dynamic forall, Same as above, but use kernel (For), a dynamic scheduler - scan + scan ====================================== ============= ========================== ====================================== ============= ========================== CUDA Execution Policies Works with Brief description ====================================== ============= ========================== cuda_exec forall, Execute loop iterations - kernel (For), in a CUDA kernel launched - scan with given thread-block + scan, in a CUDA kernel launched + sort with given thread-block size. If block size not given, the default value of 256 threads/block is @@ -110,7 +129,7 @@ caveats. cuda_thread_x_direct kernel (For) Map loop iterates directly to CUDA threads in x-dimension, one - iterate per thread + iterate per thread (see note below about limitations) cuda_thread_y_direct kernel (For) Same as above, but map @@ -118,19 +137,19 @@ caveats. cuda_thread_z_direct kernel (For) Same as above, but map to threads in z-dimension cuda_thread_x_loop kernel (For) Similar to thread-x-direct - policy, but use a + policy, but use a block-stride loop which - doesn't limit number of + doesn't limit number of loop iterates cuda_thread_y_loop kernel (For) Same as above, but for threads in y-dimension cuda_thread_z_loop kernel (For) Same as above, but for threads in z-dimension - cuda_block_x_direct kernel (For) Map loop iterates - directly to CUDA thread + cuda_block_x_direct kernel (For) Map loop iterates + directly to CUDA thread blocks in x-dimension, one iterate per block - cuda_block_y_direct kernel (For) Same as above, but map + cuda_block_y_direct kernel (For) Same as above, but map to blocks in y-dimension cuda_block_z_direct kernel (For) Same as above, but map to blocks in z-dimension @@ -143,15 +162,15 @@ caveats. blocks in y-dimension cuda_block_z_loop kernel (For) Same as above, but use blocks in z-dimension - cuda_warp_direct kernel (For) Map work to threads + cuda_warp_direct kernel (For) Map work to threads in a warp directly. Cannot be used in conjunction with cuda_thread_x_* policies. Multiple warps can be created by using - cuda_thread_y/z_* - policies. + cuda_thread_y/z_* + policies. cuda_warp_loop kernel (For) Policy to map work to threads in a warp using a warp-stride loop. @@ -161,9 +180,9 @@ caveats. Multiple warps can be created by using cuda_thread_y/z_* - policies. - cuda_warp_mask_direct> kernel (For) Policy to map work - directly to threads in a + policies. + cuda_warp_mask_direct> kernel (For) Policy to map work + directly to threads in a warp using a bit mask. Cannot be used in conjunction with @@ -194,25 +213,25 @@ caveats. 
====================================== ============= ==========================
OpenMP Target Execution Policies       Works with    Brief description
====================================== ============= ==========================
-omp_target_parallel_for_exec<#>        forall        Create parallel target
-                                                     region and execute with
-                                                     given number of threads
+omp_target_parallel_for_exec<#>        forall        Create parallel target
+                                                     region and execute with
+                                                     given number of threads
                                                      per team inside it.
                                                      Number of teams is
                                                      calculated internally; i.e.,
-                                                     apply ``omp teams
-                                                     distribute parallel for
+                                                     apply ``omp teams
+                                                     distribute parallel for
                                                      num_teams(iteration space
                                                      size/#) thread_limit(#)``
                                                      pragma
-omp_target_parallel_collapse_exec      kernel        Similar to above, but
-                                       (Collapse)    collapse
+omp_target_parallel_collapse_exec      kernel        Similar to above, but
+                                       (Collapse)    collapse
                                                      *perfectly-nested*
-                                                     loops, indicated in
+                                                     loops, indicated in
                                                      arguments to RAJA
                                                      Collapse statement. Note:
                                                      compiler determines number
-                                                     of thread teams and
+                                                     of thread teams and
                                                      threads per team
====================================== ============= ==========================
@@ -220,10 +239,53 @@ The following notes provide additional information about policy usage.
.. note:: To control the number of threads used by OpenMP policies set the
          value of the environment variable 'OMP_NUM_THREADS' (which is
-         fixed for duration of run), or call the OpenMP routine
-         'omp_set_num_threads(nthreads)' (which allows changing number of
+         fixed for duration of run), or call the OpenMP routine
+         'omp_set_num_threads(nthreads)' (which allows changing number of
          threads at runtime).

+.. note:: As noted above, some OpenMP policies must only be used within an
+          **existing** parallel region to work the way you would expect them
+          to. For example::
+
+            RAJA::region<RAJA::omp_parallel_region>([=]() {
+
+              RAJA::forall<RAJA::omp_for_nowait_exec>(segment, [=] (int idx) {
+                // do something at iterate 'idx'
+              });
+
+              RAJA::forall<RAJA::omp_for_exec>(segment, [=] (int idx) {
+                // do something else at iterate 'idx'
+              });
+
+            });
+
+          Here, the ``RAJA::region`` method call
+          creates an OpenMP parallel region, which contains two ``RAJA::forall``
+          kernels. The first uses the ``RAJA::omp_for_nowait_exec`` policy,
+          meaning that no thread synchronization is needed after the kernel.
+          Thus, threads can start working on the second kernel while others
+          are still working on the first kernel. In general, this can only be
+          guaranteed to be correct if the segments used in the two kernels
+          are the same and each loop is data parallel. The second kernel uses
+          the ``RAJA::omp_for_exec`` policy, which means that all threads will
+          complete before the kernel exits. In this example, this is not
+          really needed since there is no more code to execute in the parallel
+          region and there is an implicit barrier at the end of it.
+
+.. note:: As noted above, a *Scheduling Policy* can be specified for the
+          ``omp_for_schedule_exec`` and ``omp_for_nowait_schedule_exec`` policies.
+          All possible schedules reside under the ``RAJA::policy::omp`` namespace:
+
+          * ``Static`` is equivalent to ``schedule(static, ChunkSize)``
+          * ``Dynamic`` is equivalent to ``schedule(dynamic, ChunkSize)``
+          * ``Guided`` is equivalent to ``schedule(guided, ChunkSize)``
+          * ``Runtime`` is equivalent to ``schedule(runtime)``
+          * ``Auto`` is equivalent to no schedule specified
+
+          There is a special identifier ``RAJA::policy::omp::default_chunk_size``
+          which can be used as the template argument to ``Static``, ``Dynamic``,
+          or ``Guided`` to defer to the implementation-defined default chunk size.
+
..
note:: To control the number of TBB worker threads used by these policies: set the value of the environment variable 'TBB_NUM_WORKERS' (which is fixed for duration of run), or create a 'task_scheduler_init' object:: @@ -241,27 +303,27 @@ The following notes provide additional information about policy usage. Several notable constraints apply to RAJA CUDA *thread-direct* policies. -.. note:: * Repeating thread direct policies with the same thread dimension - in perfectly nested loops is not recommended. Your code may do +.. note:: * Repeating thread direct policies with the same thread dimension + in perfectly nested loops is not recommended. Your code may do something, but likely will not do what you expect and/or be correct. - * If multiple thread direct policies are used in a kernel (using - different thread dimensions), the product of sizes of the - corresponding iteration spaces cannot be greater than the - maximum allowable threads per block. Typically, this is - equ:math:`\leq` 1024; i.e., attempting to launch a CUDA kernel - with more than 1024 threads per block will cause the CUDA runtime - to complain about *illegal launch parameters.* - * **Thread-direct policies are recommended only for certain loop + * If multiple thread direct policies are used in a kernel (using + different thread dimensions), the product of sizes of the + corresponding iteration spaces cannot be greater than the + maximum allowable threads per block. Typically, this is + equ:math:`\leq` 1024; i.e., attempting to launch a CUDA kernel + with more than 1024 threads per block will cause the CUDA runtime + to complain about *illegal launch parameters.* + * **Thread-direct policies are recommended only for certain loop patterns, such as tiling.** -Several notes regarding CUDA thread and block *loop* policies are also good to +Several notes regarding CUDA thread and block *loop* policies are also good to know. -.. note:: * There is no constraint on the product of sizes of the associated +.. note:: * There is no constraint on the product of sizes of the associated loop iteration space. - * These polices allow having a larger number of iterates than + * These polices allow having a larger number of iterates than threads in the x, y, or z thread dimension. 
- * **Cuda thread and block loop policies are recommended for most + * **Cuda thread and block loop policies are recommended for most loop patterns.** Finally @@ -296,18 +358,18 @@ available to use for the segment iteration policy: Execution Policy Brief description ====================================== ========================================= **Serial** -seq_segit Iterate over index set segments +seq_segit Iterate over index set segments sequentially -**OpenMP CPU multithreading** -omp_parallel_segit Create OpenMP parallel region and - iterate over segments in parallel inside it; i.e., apply ``omp parallel for`` +**OpenMP CPU multithreading** +omp_parallel_segit Create OpenMP parallel region and + iterate over segments in parallel inside it; i.e., apply ``omp parallel for`` pragma on loop over segments omp_parallel_for_segit Same as above **Intel Threading Building Blocks** -tbb_segit Iterate over index set segments in - parallel using a TBB 'parallel_for' +tbb_segit Iterate over index set segments in + parallel using a TBB 'parallel_for' method ====================================== ========================================= @@ -315,14 +377,14 @@ tbb_segit Iterate over index set segments in Parallel Region Policies ------------------------- -The following policies may only be used with the ``RAJA::region`` method. +The following policies may only be used with the ``RAJA::region`` method. ``RAJA::forall`` and ``RAJA::kernel`` methods may be used within a parallel region created with the ``RAJA::region`` construct. * ``seq_region`` - Create a sequential region (see note below). * ``omp_parallel_region`` - Create an OpenMP parallel region. -For example, the following code will execute two consecutive loops in parallel +For example, the following code will execute two consecutive loops in parallel in an OpenMP parallel region without synchronizing threads between them:: RAJA::region( [=]() { @@ -340,9 +402,9 @@ in an OpenMP parallel region without synchronizing threads between them:: }); // end omp parallel region .. note:: The sequential region specialization is essentially a *pass through* - operation. It is provided so that if you want to turn off OpenMP in - your code, you can simply replace the region policy type and you do - not have to change your algorithm source code. + operation. It is provided so that if you want to turn off OpenMP in + your code, you can simply replace the region policy type and you do + not have to change your algorithm source code. .. _reducepolicy-label: @@ -367,7 +429,7 @@ Reduction Policy Loop Policies Brief description to Use With ===================== ============= =========================================== seq_reduce seq_exec, Non-parallel (sequential) reduction - loop_exec + loop_exec omp_reduce any OpenMP OpenMP parallel reduction policy omp_reduce_ordered any OpenMP OpenMP parallel reduction with result @@ -377,7 +439,7 @@ omp_target_reduce any OpenMP OpenMP parallel target offload reduction tbb_reduce any TBB TBB parallel reduction policy cuda_reduce any CUDA Parallel reduction in a CUDA kernel - policy (device synchronization will occur when + policy (device synchronization will occur when reduction value is finalized) cuda_reduce_atomic any CUDA Same as above, but reduction may use CUDA policy atomic operations @@ -395,7 +457,7 @@ Atomic Policies Each RAJA atomic operation must be defined with an 'atomic policy' type. Atomic policy types are distinct from loop execution policy types. -.. 
note :: An atomic policy type must be consistent with the loop execution +.. note :: An atomic policy type must be consistent with the loop execution policy for the kernel in which the atomic operation is used. The following table summarizes RAJA atomic policies and usage. @@ -405,21 +467,21 @@ Atomic Policy Loop Policies Brief description ===================== ============= =========================================== seq_atomic seq_exec, Atomic operation performed in a non-parallel loop_exec (sequential) kernel -omp_atomic any OpenMP Atomic operation performed in an OpenMP - policy multithreading or target kernel; i.e., +omp_atomic any OpenMP Atomic operation performed in an OpenMP + policy multithreading or target kernel; i.e., apply ``omp atomic`` pragma cuda_atomic any CUDA Atomic operation performed in a CUDA kernel - policy + policy builtin_atomic seq_exec, Compiler *builtin* atomic operation loop_exec, any OpenMP - policy + policy auto_atomic seq_exec, Atomic operation *compatible* with loop loop_exec, execution policy. See example below. any OpenMP policy, any CUDA - policy + policy ===================== ============= =========================================== Here is an example illustrating use of the ``auto_atomic`` policy:: @@ -432,13 +494,13 @@ Here is an example illustrating use of the ``auto_atomic`` policy:: }); In this case, the atomic operation knows that it is used in a CUDA kernel -context and the CUDA atomic operation is applied. Similarly, if an OpenMP -execution policy was used, the OpenMP version of the atomic operation would +context and the CUDA atomic operation is applied. Similarly, if an OpenMP +execution policy was used, the OpenMP version of the atomic operation would be used. .. note:: * There are no RAJA atomic policies for TBB (Intel Threading Building Blocks) execution contexts at present. - * The ``builtin_atomic`` policy may be preferable to the + * The ``builtin_atomic`` policy may be preferable to the ``omp_atomic`` policy in terms of performance. .. _localarraypolicy-label: @@ -465,13 +527,13 @@ for ``RAJA::LocalArray`` objects: RAJA Kernel Execution Policies -------------------------------- -RAJA kernel execution policy constructs form a simple domain specific language -for composing and transforming complex loops that relies -**solely on standard C++11 template support**. +RAJA kernel execution policy constructs form a simple domain specific language +for composing and transforming complex loops that relies +**solely on standard C++11 template support**. RAJA kernel policies are constructed using a combination of *Statements* and -*Statement Lists*. A RAJA Statement is an action, such as execute a loop, -invoke a lambda, set a thread barrier, etc. A StatementList is an ordered list -of Statements that are composed in the order that they appear in the kernel +*Statement Lists*. A RAJA Statement is an action, such as execute a loop, +invoke a lambda, set a thread barrier, etc. A StatementList is an ordered list +of Statements that are composed in the order that they appear in the kernel policy to construct a kernel. A Statement may contain an enclosed StatmentList. Thus, a ``RAJA::KernelPolicy`` type is really just a StatementList. The main Statement types provided by RAJA are ``RAJA::statement::For`` and @@ -482,10 +544,10 @@ position of the item it applies to in the iteration space tuple argument to the ``RAJA::kernel`` method. The ExecPolicy is the RAJA execution policy to use on that loop/iteration space (similar to ``RAJA::forall``). 
EnclosedStatements contain whatever is nested within the template parameter -list to form a StatementList, which will be executed for each iteration of -the loop. The ``RAJA::statement::Lambda`` invokes the lambda -corresponding to its position (LambdaID) in the sequence of lambda expressions -in the ``RAJA::kernel`` argument list. For example, a simple sequential +list to form a StatementList, which will be executed for each iteration of +the loop. The ``RAJA::statement::Lambda`` invokes the lambda +corresponding to its position (LambdaID) in the sequence of lambda expressions +in the ``RAJA::kernel`` argument list. For example, a simple sequential for-loop:: for (int i = 0; i < N; ++i) { @@ -508,17 +570,17 @@ can be represented using the RAJA kernel interface as:: } ); -.. note:: All ``RAJA::forall`` functionality can be done using the +.. note:: All ``RAJA::forall`` functionality can be done using the ``RAJA::kernel`` interface. We maintain the ``RAJA::forall`` interface since it is less verbose and thus more convenient for users. - + RAJA::kernel Statement Types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The list below summarizes the current collection of statement types that can be used with ``RAJA::kernel`` and ``RAJA::kernel_param``. More detailed -explanation along with examples of how they are used can be found in +explanation along with examples of how they are used can be found in :ref:`tutorial-label`. .. note:: * All of these statement types are in the namespace ``RAJA``. @@ -531,12 +593,6 @@ explanation along with examples of how they are used can be found in * ``statement::Lambda< LambdaId, Args...>`` extension of the lambda statement; enabling lambda arguments to be specified at compile time. - * ``statement::Segs<...>`` argument to a Lambda statement; used to specify which segments in a tuple will be used as lambda arguments. - - * ``statement::Offsets<...>`` argument to a Lambda statement; used to specify which segment offsets in a tuple will be used as lambda arguments. - - * ``statement::Params<...>`` argument to a Lambda statement; used to specify which params in a tuple will be used as lambda arguments. - * ``statement::Collapse< ExecPolicy, ArgList<...>, EnclosedStatements >`` collapses multiple perfectly nested loops specified by tuple iteration space indices in 'ArgList', using the 'ExecPolicy' execution policy, and places 'EnclosedStatements' inside the collapsed loops which are executed for each iteration. Note that this only works for CPU execution policies (e.g., sequential, OpenMP).It may be available for CUDA in the future if such use cases arise. * ``statement::CudaKernel< EnclosedStatements>`` launches 'EnclosedStatements' as a CUDA kernel; e.g., a loop nest where the iteration spaces of each loop level are associated with threads and/or thread blocks as described by the execution policies applied to them. This kernel launch is synchronous. @@ -550,7 +606,7 @@ explanation along with examples of how they are used can be found in * ``statement::CudaKernelOcc`` similar to CudaKernel but uses the CUDA occupancy calculator to determine the optimal number of threads/blocks. Statement is intended for RAJA::cuda_block_{xyz}_loop policies. This kernel launch is synchronous. * ``statement::CudaKernelOccAsync`` asynchronous version of CudaKernelOcc. - + * ``statement::CudaKernelExp`` similar to CudaKernelOcc but with the flexibility to fix the number of threads and/or blocks and let the CUDA occupancy calculator determine the unspecified values. This kernel launch is synchronous. 
* ``statement::CudaKernelExpAsync`` asynchronous version of CudaKernelExp. @@ -567,15 +623,30 @@ explanation along with examples of how they are used can be found in * ``statement::TileTCount< ArgId, ParamId, TilePolicy, ExecPolicy, EnclosedStatements >`` abstracts an outer tiling loop containing an inner for-loop over each tile, **where it is necessary to obtain the tile number in each tile**. The 'ArgId' indicates which entry in the iteration space tuple to which the loop applies and the 'ParamId' indicates the position of the tile number in the parameter tuple. The 'TilePolicy' specifies the tiling pattern to use, including its dimension. The 'ExecPolicy' and 'EnclosedStatements' are similar to what they represent in a ``statement::For`` type. - * ``statement::tile_fixed`` partitions loop iterations into tiles of a fixed size specified by 'TileSize'. This statement type can be used as the 'TilePolicy' template paramter in the Tile statements above. - * ``statement::ForICount< ArgId, ParamId, ExecPolicy, EnclosedStatements >`` abstracts an inner for-loop within an outer tiling loop **where it is necessary to obtain the local iteration index in each tile**. The 'ArgId' indicates which entry in the iteration space tuple to which the loop applies and the 'ParamId' indicates the position of the tile index parameter in the parameter tuple. The 'ExecPolicy' and 'EnclosedStatements' are similar to what they represent in a ``statement::For`` type. - * ``RAJA::statement::Reduce< ReducePolicy, Operator, ParamId, EnclosedStatements >`` reduces a value across threads to a single thread. The 'ReducePolicy' is similar to what it represents for RAJA reduction types. 'ParamId' specifies the position of the reduction value in the parameter tuple passed to the ``RAJA::kernel_param`` method. 'Operator' is the binary operator used in the reduction; typically, this will be one of the operators that can be used with RAJA scans (see :ref:`scanops-label`. After the reduction is complete, the 'EnclosedStatements' execute on the thread that received the final reduced value. + * ``statement::Reduce< ReducePolicy, Operator, ParamId, EnclosedStatements >`` reduces a value across threads to a single thread. The 'ReducePolicy' is similar to what it represents for RAJA reduction types. 'ParamId' specifies the position of the reduction value in the parameter tuple passed to the ``RAJA::kernel_param`` method. 'Operator' is the binary operator used in the reduction; typically, this will be one of the operators that can be used with RAJA scans (see :ref:`scanops-label`. After the reduction is complete, the 'EnclosedStatements' execute on the thread that received the final reduced value. * ``statement::If< Conditional >`` chooses which portions of a policy to run based on run-time evaluation of conditional statement; e.g., true or false, equal to some value, etc. * ``statement::Hyperplane< ArgId, HpExecPolicy, ArgList<...>, ExecPolicy, EnclosedStatements >`` provides a hyperplane (or wavefront) iteration pattern over multiple indices. A hyperplane is a set of multi-dimensional index values: i0, i1, ... such that h = i0 + i1 + ... for a given h. Here, 'ArgId' is the position of the loop argument we will iterate on (defines the order of hyperplanes), 'HpExecPolicy' is the execution policy used to iterate over the iteration space specified by ArgId (often sequential), 'ArgList' is a list of other indices that along with ArgId define a hyperplane, and 'ExecPolicy' is the execution policy that applies to the loops in ArgList. 
Then, for each iteration, everything in the 'EnclosedStatements' is executed.
+
+The following list summarizes auxiliary types used in the above statements. These
+types live in the ``RAJA`` namespace.
+
+  * ``tile_fixed`` tile policy argument to a ``Tile`` or ``TileTCount`` statement; partitions loop iterations into tiles of a fixed size specified by 'TileSize'. This type can be used as the 'TilePolicy' template parameter in the ``Tile`` statements above.
+
+  * ``tile_dynamic`` TilePolicy argument to a Tile or TileTCount statement; partitions loop iterations into tiles of a size specified by a ``TileSize{}`` positional parameter argument. This type can be used as the 'TilePolicy' template parameter in the ``Tile`` statements above.
+
+  * ``Segs<...>`` argument to a Lambda statement; used to specify which segments in a tuple will be used as lambda arguments.
+
+  * ``Offsets<...>`` argument to a Lambda statement; used to specify which segment offsets in a tuple will be used as lambda arguments.
+
+  * ``Params<...>`` argument to a Lambda statement; used to specify which params in a tuple will be used as lambda arguments.
+
+  * ``ValuesT`` argument to a Lambda statement; used to specify compile time constants, of type T, that will be used as lambda arguments.
+
+
Examples that show how to use a variety of these statement types can be found in :ref:`tutorialcomplex-label`.
diff --git a/docs/sphinx/user_guide/feature/reduction.rst b/docs/sphinx/user_guide/feature/reduction.rst
index 090858b4a2..d3b6112c23 100644
--- a/docs/sphinx/user_guide/feature/reduction.rst
+++ b/docs/sphinx/user_guide/feature/reduction.rst
@@ -13,9 +13,9 @@ Reduction Operations
 ====================

 RAJA does not provide separate loop execution methods for loops containing
-reduction operations like some other C++ loop programming abstraction models do.
+reduction operations like some other C++ loop programming abstraction models.
 Instead, RAJA provides reduction types that allow users to perform reduction
-operations in ``RAJA::forall`` and ``RAJA::kernel`` methods in a portable,
+operations in ``RAJA::forall`` and ``RAJA::kernel`` kernels in a portable,
 thread-safe manner. Users may use as many reduction objects in a loop kernel
 as they need. Available RAJA reduction types are described in this section.
@@ -27,11 +27,14 @@ A detailed example of RAJA reduction usage can be found in
 Also

 .. note:: * Each RAJA reduction type is templated on a **reduction policy**
-            and a **reduction value type** for the reduction variable.
-          * Each RAJA reduction type accepts an **initial reduction value** at
-            construction.
-          * Each RAJA reduction type has a 'get' method to access its reduced
-            value after kernel execution completes.
+            and a **reduction value type** for the reduction variable. The
+            **reduction policy type must be compatible with the execution
+            policy used by the kernel.** For example, in a CUDA kernel,
+            a CUDA reduction policy must be used.
+          * Each RAJA reduction type accepts an **initial reduction value or
+            values** at construction (see below).
+          * Each RAJA reduction type has a 'get' method to access reduced
+            values after kernel execution completes.

----------------
@@ -50,13 +53,28 @@ RAJA supports five common reduction types:
* ``ReduceMaxLoc< reduce_policy, data_type >`` - Max value and a loop index where the maximum was found.
+and two less common bitwise reduction types:
+
+* ``ReduceBitAnd< reduce_policy, data_type >`` - Bitwise 'and' of values (i.e., ``a & b``).
+ +* ``ReduceBitOr< reduce_policy, data_type >`` - Bitwise 'or' of values (i.e., ``a | b``). + .. note:: * When ``RAJA::ReduceMinLoc`` and ``RAJA::ReduceMaxLoc`` are used in a sequential execution context, the loop index of the min/max is the first index where the min/max occurs. - * When the 'loc' reductions are used in a parallel execution context, - the loop index given for the reduction value may be any index + * When these reductions are used in a parallel execution context, + the loop index computed for the reduction value may be any index where the min or max occurs. +.. note:: ``RAJA::ReduceBitAnd`` and ``RAJA::ReduceBitOr`` reduction types are designed to work on integral data types because **in C++, at the language level, there is no such thing as a bitwise operator on floating-point numbers.** + +------------------- +Reduction Examples +------------------- + +Next, we provide a few examples to illustrate basic usage of RAJA reduction +types. + Here is a simple RAJA reduction example that shows how to use a sum reduction type and a min-loc reduction type:: @@ -64,15 +82,19 @@ type and a min-loc reduction type:: // // Initialize array of length N with all ones. Then, set some other - // values to make the example mildly interesting... + // values in the array to make the example mildly interesting... // int vec[N] = {1}; vec[100] = -10; vec[500] = -10; - // Create sum and min-loc reduction objects with initial values + // Create a sum reduction object with initial value of zero RAJA::ReduceSum< RAJA::omp_reduce, int > vsum(0); + + // Create a min-loc reduction object with initial min value of 100 + // and initial location index value of -1 RAJA::ReduceMinLoc< RAJA::omp_reduce, int > vminloc(100, -1); + // Run a kernel using the reduction objects RAJA::forall( RAJA::RangeSegment(0, N), [=](RAJA::Index_type i) { @@ -81,6 +103,7 @@ type and a min-loc reduction type:: }); + // After kernel is run, extract the reduced values int my_vsum = static_cast(vsum.get()); int my_vmin = static_cast(vminloc.get()); @@ -94,7 +117,37 @@ The results of these operations will yield the following values: Note that the location index for the minimum array value can be one of two values depending on the order of the reduction finalization since the loop -is run in parallel. +is run in parallel. Also, note that the reduction objects are created using +a ``RAJA::omp_reduce`` reduction policy, which is compatible with the +OpenMP execution policy used in the kernel. + +Here is an example of a bitwise or reduction:: + + const int N = 100; + + // + // Initialize all entries in array of length N to the value '9' + // + int vec[N] = {9}; + + // Create a bitwise or reduction object with initial value of '5' + RAJA::ReduceBitOr< RAJA::omp_reduce, int > my_or(5); + + // Run a kernel using the reduction object + RAJA::forall( RAJA::RangeSegment(0, N), + [=](RAJA::Index_type i) { + + my_or |= vec[i]; + + }); + + // After kernel is run, extract the reduced value + int my_or_reduce_val = static_cast(my_or.get()); + +The result of the reduction is the value '13'. In binary representation +(i.e., bits), :math:`9 = ...01001` (the vector entries) and +:math:`5 = ...00101` (the initial reduction value). +So :math:`9 | 5 = ...01001 | ...00101 = ...01101 = 13`. 
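+
+The same pattern applies to other back-ends as long as the reduction policy
+matches the execution policy used by the kernel. For instance, a CUDA variant
+of the sum reduction might look like the following sketch (it assumes a
+CUDA-enabled build and that ``d_vec`` points to device-accessible memory of
+length N)::
+
+  RAJA::ReduceSum< RAJA::cuda_reduce, int > dsum(0);
+
+  RAJA::forall< RAJA::cuda_exec<256> >(RAJA::RangeSegment(0, N),
+    [=] RAJA_DEVICE (RAJA::Index_type i) {
+
+    dsum += d_vec[i];
+
+  });
+
+  int my_dsum = static_cast<int>(dsum.get());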
------------------- Reduction Policies diff --git a/docs/sphinx/user_guide/feature/resource.rst b/docs/sphinx/user_guide/feature/resource.rst new file mode 100644 index 0000000000..1daf169299 --- /dev/null +++ b/docs/sphinx/user_guide/feature/resource.rst @@ -0,0 +1,286 @@ +.. ## +.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +.. ## and other RAJA project contributors. See the RAJA/COPYRIGHT file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _resource-label: + +========= +Resources +========= + +This section describes the basic concepts of Resource types and their +functionality in ``RAJA::forall``. Resources are used as an interface to +various backend constructs and their respective hardware. Currently there +exists Resource types for ``Cuda``, ``Hip``, ``Omp`` (target) and ``Host``. +Resource objects allow the user to execute ``RAJA::forall`` calls +asynchronously on a respective thread/stream. The underlying concept of each +individual Resource is still under development and it should be considered +that functionality / behaviour may change. + +.. note:: * Currently feature complete asynchronous behavior and + streamed/threaded support is available only for ``Cuda`` and + ``Hip`` resources. + * The ``RAJA::resources`` namespace aliases the ``camp::resources`` + namespace. + +Each resource has a set of underlying functionality that is synonymous across +all resource types. + + ===================== =============================================== + Methods Brief description + ===================== =============================================== + get_platform Returns the underlying camp platform + the resource is associated with. + get_event Return an Event object for the resource from + the last resource call. + allocate Allocate data per the resource's given + backend. + deallocate Deallocate data per the resource's given + backend. + memcpy Perform a memory copy from a src location + to a destination location from the + resource's backend. + memset Set memory value per the resourse's + given backend. + wait_for Enqueue a wait on the resource's stream/thread + for a user passed event to occur. + ===================== =============================================== + +.. note:: ``deallocate``, ``memcpy`` and ``memset`` will only work with + pointers that correspond to memory locations that have been + allocated on the resource's respective device. + +Each resource type also defines specific backend information/functionality. +For example, each CUDA resource contains a ``cudaStream_t`` value with an +associated get method. See the individual functionality for each resource +in ``raja/tpl/camp/include/resource/``. + +.. note:: Stream IDs are assigned to resources in a round robin fashion. The + number of independent streams for a given backend is limited to the + maximum number of concurrent streams that the back-end supports. + +------------ +Type-Erasure +------------ + +Resources can be declared in two formats: An erased resource, and a concrete +resource. The underlying runtime functionality is the same for both formats. +An erased resource allows a user the ability to change the resource backend +at runtime. 
+ +Concrete CUDA resource:: + + RAJA::resources::Cuda my_cuda_res; + +Erased resource:: + + if (use_gpu) + RAJA::resources::Resource my_res{RAJA::resources::Cuda()}; + else + RAJA::resources::Resource my_res{RAJA::resources::Host()}; + + +Memory allocation on resources:: + + int* a1 = my_cuda_res.allocate(ARRAY_SIZE); + int* a2 = my_res.allocate(ARRAY_SIZE); + +If ``use_gpu`` is ``true``, then the underlying type of ``my_res`` is a CUDA +resource. Therefore ``a1`` and ``a2`` will both be allocated on the GPU. If +``use_gpu`` is ``false``, then only ``a1`` is allocated on the GPU, and +``a2`` is allocated on the host. + + +------ +Forall +------ + +A resource is an optional argument to a ``RAJA::forall`` call. When used, +it is passed as the first argument to the method:: + + RAJA::forall(my_gpu_res, .... ) + +When specifying a CUDA or HIP resource, the ``RAJA::forall`` is executed +aynchronously on a stream. Currently, CUDA and HIP are the only Resources +that enable asynchronous threading with a ``RAJA::forall``. All other calls +default to using the ``Host`` resource until further support is added. + +The Resource type that is passed to a ``RAJA::forall`` call must be a concrete +type. This is to allow for a compile-time assertion that the resource is not +compatible with the given execution policy. For example:: + + using ExecPol = RAJA::cuda_exec_async; + RAJA::resources::Cuda my_cuda_res; + RAJA::resources::Resource my_res{RAJA::resources::Cuda()}; + RAJA::resources::Host my_host_res; + + RAJA::forall(my_cuda_res, .... ) // Compiles. + RAJA::forall(my_res, .... ) // Compilation Error. Not Concrete. + RAJA::forall(my_host_res, .... ) // Compilation Error. Mismatched Resource and Exec Policy. + +Below is a list of the currently available concrete resource types and their +execution policy suport. + + ======== ============================== + Resource Policies supported + ======== ============================== + Cuda | cuda_exec + | cuda_exec_async + Hip | hip_exec + | hip_exec_async + Omp* | omp_target_parallel_for_exec + | omp_target_parallel_for_exec_n + Host | loop_exec + | seq_exec + | openmp_parallel_exec + | omp_for_schedule_exec + | omp_for_nowait_schedule_exec + | simd_exec + | tbb_for_dynamic + | tbb_for_static + ======== ============================== + +.. note:: The ``RAJA::resources::Omp`` resource is still under development. + +IndexSet policies require two execution policies (see :ref:`indexsets-label`). +Currently, users may only pass a single resource to a forall method taking +an IndexSet argument. This resource is used for the inner execution of +each Segment in the IndexSet:: + + using ExecPol = RAJA::ExecPolicy>; + RAJA::forall(my_cuda_res, iset, .... ); + + +When a resource is not provided by the user, a *default* resource is assigned, +which can be accessed in a number of ways. It can be accessed directly from +the concrete resource type:: + + RAJA::resources::Cuda my_default_cuda = RAJA::resources::Cuda::get_default(); + +The resource type can also be deduced from an execution policy:: + + using Res = RAJA::resources::get_resource::type; + Res r = Res::get_default(); + +Finally, the resource type can be deduced from an execution policy:: + + auto my_resource = RAJA::resources::get_default_resource(); + +.. note:: For CUDA and HIP, the default resource is *NOT* the CUDA or HIP + default stream. It is its own stream defined in + ``camp/include/resource/``. 
This is an attempt to break away + from some of the issues that arise from the synchronization behaviour + of the CUDA and HIP default streams. It is still possible to use the + CUDA and HIP default streams as the default resource. This can be + enabled by defining the environment variable + ``CAMP_USE_PLATFORM_DEFAULT_STREAM`` before compiling RAJA in a + project. + +------ +Events +------ + +Event objects allow users to wait or query the status of a resource's action. An +event can be returned from a resource:: + + RAJA::resources::Event e = my_res.get_event(); + +Getting an event like this enqueues an event object for the given back-end. + +Users can call the *blocking* ``wait`` function on the event:: + + e.wait(); + +Preferably, users can enqueue the event on a specific resource, forcing only +that resource to wait for the event:: + + my_res.wait_for(&e); + +The usage allows one to set up dependencies between resource objects and +``RAJA::forall`` calls. + +.. note:: An Event object is only created if a user explicitly sets the event + returned by the ``RAJA::forall`` call to a variable. This avoids + unnecessary event objects being created when not needed. For example:: + + forall>(my_cuda_res, ... + + will *not* generate a cudaStreamEvent, whereas:: + + RAJA::resources::Event e = forall>(my_cuda_res, ... + + will generate a cudaStreamEvent. + +------- +Example +------- + +This example executes three kernels across two cuda streams on the GPU with +a requirement that the first and second kernel finish execution before +launching the third. It also demonstrates copying memory from the device +to host on a resource: + +First, define two concrete CUDA resources and one host resource: + +.. literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_defres_start + :end-before: _raja_res_defres_end + :language: C++ + +Next, allocate data for two device arrays and one host array: + +.. literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_alloc_start + :end-before: _raja_res_alloc_end + :language: C++ + +Then, Execute a kernel on CUDA stream 1 ``res_gpu1``: + +.. literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_k1_start + :end-before: _raja_res_k1_end + :language: C++ + +and execute another kernel on CUDA stream 2 ``res_gpu2`` storing a handle to +an ``Event`` object to a local variable: + +.. literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_k2_start + :end-before: _raja_res_k2_end + :language: C++ + +The next kernel on ``res_gpu1`` requires that the last kernel on ``res_gpu2`` +finish first. Therefore, we enqueue a wait on ``res_gpu1`` that enforces +this: + +.. literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_wait_start + :end-before: _raja_res_wait_end + :language: C++ + +Execute the second kernel on ``res_gpu1`` now that the two previous kernels +have finished: + +.. literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_k3_start + :end-before: _raja_res_k3_end + :language: C++ + +We can enqueue a memcpy operation on ``res_gpu1`` to move data from the device +to the host: + +.. literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_memcpy_start + :end-before: _raja_res_memcpy_end + :language: C++ + +Lastly, we use the copied data on the host side: + +.. 
literalinclude:: ../../../../examples/resource-forall.cpp + :start-after: _raja_res_k4_start + :end-before: _raja_res_k4_end + :language: C++ diff --git a/docs/sphinx/user_guide/feature/scan.rst b/docs/sphinx/user_guide/feature/scan.rst index f0203663eb..3730e08d60 100644 --- a/docs/sphinx/user_guide/feature/scan.rst +++ b/docs/sphinx/user_guide/feature/scan.rst @@ -28,13 +28,12 @@ A few important notes: Also: -.. note:: For scans using the CUDA back-end, RAJA uses the implementations - provided by the NVIDIA cub library, which is available in the - RAJA source repository as a Git submodule. The CMake variable - ``CUB_DIR`` will be automatically set to the location of the cub - library when CUDA is enabled; to use a different version of the - cub library, install it and set the ``CUB_DIR`` variable to the - desired location when running CMake. +.. note:: For scans using the CUDA back-end, RAJA uses the NVIDIA cub library + internally, which is available in the RAJA source repository as a + Git submodule. The CMake variable ``CUB_DIR`` will be automatically + set to the location of the cub library when CUDA is enabled. Details + for using a different version of the cub library are available in + the :ref:`getting_started-label` section. Please see the :ref:`scan-label` tutorial section for usage examples of RAJA scan operations. @@ -111,6 +110,8 @@ Using RAJA exclusive scans is essentially the same as for inclusive scans: * ``RAJA::exclusive_scan< exec_policy >(in, in + N, out)`` * ``RAJA::exclusive_scan< exec_policy >(in, in + N, out, operator)`` +and + * ``RAJA::exclusive_scan_inplace< exec_policy >(in, in + N)`` * ``RAJA::exclusive_scan_inplace< exec_policy >(in, in + N, )`` diff --git a/docs/sphinx/user_guide/feature/sort.rst b/docs/sphinx/user_guide/feature/sort.rst new file mode 100644 index 0000000000..c172559b49 --- /dev/null +++ b/docs/sphinx/user_guide/feature/sort.rst @@ -0,0 +1,161 @@ +.. ## +.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +.. ## and other RAJA project contributors. See the RAJA/COPYRIGHT file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _sort-label: + +================ +Sorts +================ + +RAJA provides portable parallel sort operations, which are basic +parallel algorithm building blocks. They are described in this section. + +A few important notes: + +.. note:: * All RAJA sort operations are in the namespace ``RAJA``. + * Each RAJA sort operation is a template on an *execution policy* + parameter. The same policy types used for ``RAJA::forall`` methods + may be used for RAJA sorts. + * RAJA sort operations accept an optional *comparator* argument so + users can perform different types of sort operations. If + no operator is given, the default is a 'less' operation and + the result is **non-decreasing**. + +Also: + +.. note:: * For sorts using the CUDA back-end, RAJA uses the implementations + provided by the NVIDIA cub library. For information please see + :ref:`build-external-tpl `. + * The RAJA CUDA back-end implementation only supports sorting + arithmetic types using RAJA operators less and greater. + +Please see the :ref:`sort-label` tutorial section for usage examples of RAJA +sort operations. + +----------------- +Sort Operations +----------------- + +In general, a sort operation takes a sequence of numbers ``x`` and a binary +comparison operator ``op`` that forms a strict weak ordering of elements in input +sequence ``x`` and produces a sequence of numbers ``y`` as output. 
The output sequence +is a permutation of the input sequence where each pair of elements ``a`` and ``b``, +where ``a`` is before ``b`` in the output sequence, satisfies ``!(b op a)``. +Sorts are stable if they always preserve the order of equivalent elements, +where equivalent elements satisfy ``!(a op b) && !(b op a)``. + +A **stable sort** takes an input sequence ``x`` where equivalent elements a\ :sub:`i` +and a\ :sub:`j` for any i != j where a\ :sub:`i` appears before a\ :sub:`j` if i < j + + x = { a\ :sub:`0`\, b\ :sub:`0`\, a\ :sub:`1`\, ... } + +and calculates the stably sorted output sequence ``y`` which preserves the order of +equivalent elements, in other words the sorted sequence where element a\ :sub:`i` +appears before the equivalent element a\ :sub:`j` if i < j: + + y = { a\ :sub:`0`\, a\ :sub:`1`\, b\ :sub:`0`\, ... } + +An **unstable sort** may not preserve the order of equivalent elements and +may produce either of the following output sequences. + + y = { a\ :sub:`0`\, a\ :sub:`1`\, b\ :sub:`0`\, ... } + + or + + y = { a\ :sub:`1`\, a\ :sub:`0`\, b\ :sub:`0`\, ... } + +--------------------- +RAJA Unstable Sorts +--------------------- + +RAJA unstable sort operations look like the following: + + * ``RAJA::sort< exec_policy >(container)`` + * ``RAJA::sort< exec_policy >(container, comparator)`` + * ``RAJA::sort< exec_policy >(iter, iter + N)`` + * ``RAJA::sort< exec_policy >(iter, iter + N, comparator)`` + +For example sorting the ``in`` array filled with this sequence of values:: + + 6 7 2 1 0 9 4 8 5 3 4 9 6 3 7 0 1 8 2 5 + +by performing a sequential unstable sort operation using the following code: + +.. literalinclude:: ../../../../examples/tut_sort.cpp + :start-after: _sort_seq_start + :end-before: _sort_seq_end + :language: C++ + +fills the ``out`` array with this sequence of values:: + + 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 + +Note that the syntax is essentially the same as :ref:`scan-label`. +Here, ``container`` is a range of elements and ``iter`` is a random access +iterator to a range of elements. ``container`` and ``iter`` provide access to the +input sequence and contain the output sequence at the end of sort. The first +and third sort operations above will be *non-decreasing* sorts since there is +no comparator argument given; i.e., the sequences will be reordered *in-place* +using operator::less. The second and fourth sorts will apply the comparator +that is passed into the function. + +RAJA also provides sort pairs that operate on key, value pairs stored +separately: + + * ``RAJA::sort_pairs< exec_policy >(keys_container, vals_container)`` + * ``RAJA::sort_pairs< exec_policy >(keys_container, vals_container, comparator)`` + * ``RAJA::sort_pairs< exec_policy >(keys_iter, keys_iter + N, vals_iter)`` + * ``RAJA::sort_pairs< exec_policy >(keys_iter, keys_iter + N, vals_iter, comparator)`` + +Sort pairs generates the same output sequence of keys in ``keys_container`` or +``keys_iter`` as sort does in ``container`` or ``iter`` and also reorders the sequence +of values in ``vals_container`` or ``vals_iter`` by permuting the sequence of values +in the same manner as the sequence of keys; i.e. sorting the sequence of pairs +based on their keys. Note that the ``comparator`` used in sort_pairs only compares +keys. 
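+
+For illustration, a sequential ``sort_pairs`` call using the 'greater'
+comparison operator described below might look like the following sketch
+(the array contents are illustrative)::
+
+  int keys[5] = {6, 7, 2, 1, 0};
+  int vals[5] = {0, 1, 2, 3, 4};  // vals[i] records the original position of keys[i]
+
+  // Sort keys in non-increasing order; vals is permuted in the same way.
+  RAJA::sort_pairs< RAJA::seq_exec >(keys, keys + 5, vals,
+                                     RAJA::operators::greater<int>{});
+
+  // keys is now {7, 6, 2, 1, 0} and vals is now {1, 0, 2, 3, 4}.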
+ +--------------------- +RAJA Stable Sorts +--------------------- + +Using RAJA stable sorts is essentially the same as unstable sorts: + + * ``RAJA::stable_sort< exec_policy >(container)`` + * ``RAJA::stable_sort< exec_policy >(container, comparator)`` + * ``RAJA::stable_sort< exec_policy >(iter, iter + N)`` + * ``RAJA::stable_sort< exec_policy >(iter, iter + N, comparator)`` + +RAJA also provides stable sort pairs that operate on key, value pairs stored +separately: + + * ``RAJA::stable_sort_pairs< exec_policy >(keys_container, vals_container)`` + * ``RAJA::stable_sort_pairs< exec_policy >(keys_container, vals_container, comparator)`` + * ``RAJA::stable_sort_pairs< exec_policy >(keys_iter, keys_iter + N, vals_iter)`` + * ``RAJA::stable_sort_pairs< exec_policy >(keys_iter, keys_iter + N, vals_iter, comparator)`` + +.. _sortops-label: + +-------------------- +RAJA Comparison Operators +-------------------- + +RAJA provides two operators that can be used to produce different ordered sorts: + + * ``RAJA::operators::less`` + * ``RAJA::operators::greater`` + +.. note:: * All RAJA comparison operators are in the namespace ``RAJA::operators``. + +------------------- +Sort Policies +------------------- + +For information about RAJA execution policies to use with sort operations, +please see :ref:`policies-label`. + + diff --git a/docs/sphinx/user_guide/feature/tiling.rst b/docs/sphinx/user_guide/feature/tiling.rst index b2bb1316df..907905b743 100644 --- a/docs/sphinx/user_guide/feature/tiling.rst +++ b/docs/sphinx/user_guide/feature/tiling.rst @@ -16,8 +16,8 @@ In this section, we discuss RAJA statements that can be used to tile nested for-loops. Typical loop tiling involves partitioning an iteration space into a collection of "tiles" and then iterating over tiles in outer loops and entries within each tile in inner loops. Many scientific computing algorithms -can benefit from loop tiling due to more efficient cache usage and other -considerations. +can benefit from loop tiling due to more efficient cache usage on a CPU or +use of GPU shared memory. For example, an operation performed using a for-loop with a range of [0, 10):: @@ -44,7 +44,7 @@ statement types. using KERNEL_EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::Tile<0, RAJA::statement::tile_fixed<2>, RAJA::seq_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed<2>, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > @@ -59,11 +59,11 @@ statement types. In RAJA, the simplest way to tile an iteration space is to use RAJA ``statement::Tile`` and ``statement::For`` statement types. A ``statement::Tile`` type is similar to a ``statement::For`` type, but takes -a tile size as the second template argument. The Tile statement generates -the outer loop over tiles and the For statement iterates over each tile. -Nested together, as in the example, these statements will pass the global -index 'i' to the loop body in the lambda expression as in the non-tiled -version above. +a tile size as the second template argument. The ``statement::Tile`` +construct generates the outer loop over tiles and the ``statement::For`` +statement iterates over each tile. Nested together, as in the example, these +statements will pass the global index 'i' to the loop body in the lambda +expression as in the non-tiled version above. .. note:: When using ``statement::Tile`` and ``statement::For`` types together to define a tiled loop structure, the integer passed as the first @@ -71,13 +71,13 @@ version above. 
indicates that they both apply to the same item in the iteration space tuple passed to the ``RAJA::kernel`` methods. -RAJA also provides alternative Tile and For statements that provide the tile +RAJA also provides alternative tiling and for statements that provide the tile number and local tile index, if needed inside the kernel body, as shown below:: using KERNEL_EXEC_POL2 = RAJA::KernelPolicy< RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, - RAJA::statement::tile_fixed<2>, RAJA::seq_exec, + RAJA::tile_fixed<2>, RAJA::seq_exec, RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, RAJA::statement::Lambda<0> @@ -98,16 +98,17 @@ number and local tile index, if needed inside the kernel body, as shown below:: }); The ``statement::TileTCount`` type allows the tile number to be accessed as a -parameter and the ``statement::ForICount`` type allows the local tile loop -index to be accessed. These values are specified in the tuple, which is the -second argument passed to the ``RAJA::kernel_param`` method above. The -``statement::Param<#>`` type appearing as the second template parameter for -each statement type indicates which parameter tuple entry the tile number -or local tile loop index is passed to the lambda, and in what order. Here, -the tile number is the second lambda argument (tuple parameter '0') and the -local tile loop index is the third lambda argument (tuple parameter '1'). +lambda argument and the ``statement::ForICount`` type allows the local tile +loop index to be accessed as a lambda argument. These values are specified in +the tuple, which is the second argument passed to the ``RAJA::kernel_param`` +method above. The ``statement::Param<#>`` type appearing as the second +template parameter for each statement type indicates which parameter tuple +entry the tile number or local tile loop index is passed to the lambda, and +in which order. Here, the tile number is the second lambda argument (tuple +parameter '0') and the local tile loop index is the third lambda argument +(tuple parameter '1'). .. note:: The global loop indices always appear as the first lambda expression - arguments. Then, the parameter tuples, identified by the integers + arguments. Then, the parameter tuples identified by the integers in the ``Param`` statement types given for the loop statement types follow. diff --git a/docs/sphinx/user_guide/feature/view.rst b/docs/sphinx/user_guide/feature/view.rst index dfc4472eff..d46930c1c6 100644 --- a/docs/sphinx/user_guide/feature/view.rst +++ b/docs/sphinx/user_guide/feature/view.rst @@ -12,8 +12,8 @@ View and Layout =============== -Matrix and tensor objects are naturally expressed in -scientific computing applications as multi-dimensional arrays. However, +Matrix and tensor objects, which are common in scientific computing +applications, are naturally expressed as multi-dimensional arrays. However, for efficiency in C and C++, they are usually allocated as one-dimensional arrays. For example, a matrix :math:`A` of dimension :math:`N_r \times N_c` is typically allocated as:: @@ -22,13 +22,13 @@ typically allocated as:: Using a one-dimensional array makes it necessary to convert two-dimensional indices (rows and columns of a matrix) to a one-dimensional -pointer offset index to access the corresponding array memory location. One -could introduce a macro such as:: +pointer offset to access the corresponding array memory location. 
One +could use a macro such as:: #define A(r, c) A[c + N_c * r] to access a matrix entry in row `r` and column `c`. However, this solution has -limitations; e.g., additional macro definitions are needed when adopting a +limitations; e.g., additional macro definitions may be needed when adopting a different matrix data layout or when using other matrices. To facilitate multi-dimensional indexing and different indexing layouts, RAJA provides ``RAJA::View`` and ``RAJA::Layout`` classes. @@ -37,8 +37,8 @@ multi-dimensional indexing and different indexing layouts, RAJA provides RAJA Views ---------- -A ``RAJA::View`` object wraps a pointer and enables various indexing schemes -based on the definition of a ``RAJA::Layout`` object. We can +A ``RAJA::View`` object wraps a pointer and enables indexing into the data +referenced via the pointer based on a ``RAJA::Layout`` object. We can create a ``RAJA::View`` for a matrix with dimensions :math:`N_r \times N_c` using a RAJA View and a default RAJA two-dimensional Layout as follows:: @@ -52,7 +52,7 @@ extent of each matrix dimension as arguments. The template parameters to the ``RAJA::View`` type define the pointer type and the Layout type; here, the Layout just defines the number of index dimensions. Using the resulting view object, one may access matrix entries in a row-major fashion (the -default RAJA layout) through the View parenthesis operator:: +default RAJA layout) through the view *parenthesis operator*:: // r - row index of a matrix // c - column index of a matrix @@ -82,6 +82,54 @@ accesses array entries with unit stride. The loop:: access array entries with stride N :subscript:`n` * N :subscript:`(n-1)` * ... * N :subscript:`(j+1)`. +MultiView +^^^^^^^^^^^^^^^^ + +A ``RAJA::MultiView`` object wraps an array-of-pointers, +or a pointer-to-pointers, whereas a ``RAJA::View`` wraps a single +pointer or array. This allows a single ``RAJA::Layout`` to be applied to +multiple arrays internal to the MultiView, allowing multiple arrays to share indexing +arithmetic when their access patterns are the same. + +The instantiation of a MultiView works exactly like a standard View, +except that it takes an array-of-pointers. In the following example, a MultiView +applies a 1-D layout of length 4 to 2 internal arrays in ``myarr``. + +.. literalinclude:: ../../../../examples/multiview.cpp + :start-after: _multiview_example_1Dinit_start + :end-before: _multiview_example_1Dinit_end + :language: C++ + +The default MultiView accesses internal arrays via the 0th position of the MultiView. + +.. literalinclude:: ../../../../examples/multiview.cpp + :start-after: _multiview_example_1Daccess_start + :end-before: _multiview_example_1Daccess_end + :language: C++ + +The index into the array-of-pointers can be moved to different +indices of the MultiView ``()`` access operator, rather than the default 0th position. By +passing a third template parameter to the MultiView constructor, the internal array index +and the integer indicating which array to access can be reversed. + +.. literalinclude:: ../../../../examples/multiview.cpp + :start-after: _multiview_example_1Daopindex_start + :end-before: _multiview_example_1Daopindex_end + :language: C++ + +As the number of Layout dimensions increases, the index into the array-of-pointers can be +moved to more distinct locations in the MultiView ``()`` access operator. 
Here is an example +which compares the accesses of a 2-D layout on a normal ``RAJA::View`` with a ``RAJA::MultiView`` +with the array-of-pointers index set to the 2nd position. + +.. literalinclude:: ../../../../examples/multiview.cpp + :start-after: _multiview_example_2Daopindex_start + :end-before: _multiview_example_2Daopindex_end + :language: C++ + +.. note:: MultiView does not currently work with Layouts which use strongly + typed indices. It has not been tested yet with atomic accesses. + ------------ RAJA Layouts ------------ @@ -90,7 +138,7 @@ RAJA Layouts striding orders, offsets, and permutations. In addition to layouts created using the default Layout constructor, as shown above, RAJA provides other methods to generate layouts for different indexing patterns. We describe -these next. +them here. Permuted Layout ^^^^^^^^^^^^^^^^ @@ -114,11 +162,12 @@ second index (index 1 - extent 7) has stride 55 (= 5*11). The first argument to ``RAJA::make_permuted_layout`` is a C++ array whose entries define the extent of each index dimension. **The double braces are -required to prevent compilation errors/warnings about issues trying to -initialize a sub-object.** The second argument is the striding permutation. +required to properly initialize the internal sub-object which holds the +extents.** The second argument is the striding permutation and similarly +requires double braces. -In the next example, we create the same permuted layout, then create -a ``RAJA::View`` with it in a way that tells the View which index has +In the next example, we create the same permuted layout as above, then create +a ``RAJA::View`` with it in a way that tells the view which index has unit stride:: const int s0 = 5; // extent of dimension 0 @@ -131,18 +180,19 @@ unit stride:: RAJA::Layout<3> layout = RAJA::make_permuted_layout( {{s0, s1, s2}}, perm ); - // The Layout template parameters are dimension, 'linear index' type, - // and the index with unit stride - RAJA::View > Bview(B, layout); + // The Layout template parameters are dimension, 'linear index' type used + // when converting an index triple into the corresponding pointer offset + // index, and the index with unit stride + RAJA::View > Bview(B, layout); // Equivalent to indexing as: B[i + j * s0 * s2 + k * s0] Bview(i, j, k) = ...; .. note:: Telling a view which index has unit stride makes the multi-dimensional index calculation more efficient by avoiding - multiplication by '1' when it is unnecessary. **This must be done - so that the layout permutation and unit-stride index specification - are the same to prevent incorrect indexing.** + multiplication by '1' when it is unnecessary. **The layout + permutation and unit-stride index specification + must be consistent to prevent incorrect indexing.** Offset Layout ^^^^^^^^^^^^^^^^ @@ -164,9 +214,15 @@ it using indices in :math:`[-5, 5]`. In other words, one can use the loop:: } to initialize the values of the array. Each 'i' loop index value is converted -to array offset access index by subtracting the lower offset to it; i.e., in +to an array offset index by subtracting the lower offset from it; i.e., in the loop, each 'i' value has '-5' subtracted from it to properly access the -array entry. +array entry. That is, the sequence of indices generated by the for-loop:: + + -5 -4 -3 ... 5 + +will index into the data array as:: + + 0 1 2 ... 10 The arguments to the ``RAJA::make_offset_layout`` method are C++ arrays that hold the start and end values of the indices. 
RAJA offset layouts support @@ -177,9 +233,8 @@ any number of dimensions; for example:: defines a two-dimensional layout that enables one to index into a view using indices :math:`[-1, 2]` in the first dimension and indices :math:`[-5, 5]` in -the second dimension. As we remarked earlier, double braces are needed to -prevent compilation errors/warnings about issues trying to initialize a -sub-object. +the second dimension. As noted earlier, double braces are needed to +properly initialize the internal data in the layout object. Permuted Offset Layout ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -195,13 +250,9 @@ indices. For example,:: Here, the two-dimensional index space is :math:`[-1, 2] \times [-5, 5]`, the same as above. However, the index strides are permuted so that the first index (index 0) has unit stride and the second index (index 1) has stride 4, -since the first index dimension has length 4. +which is the extent of the first index (:math:`[-1, 2]`). -Complete examples illustrating ``RAJA::Layouts`` and ``RAJA::Views`` may -be found in the :ref:`offset-label` and :ref:`permuted-layout-label` -tutorial sections. - -.. note:: It is important to note some facts about RAJA Layout types. +.. note:: It is important to note some facts about RAJA layout types. All layouts have a permutation. So a permuted layout and a "non-permuted" layout (i.e., default permutation) has the type ``RAJA::Layout``. Any layout with an offset has the @@ -210,14 +261,19 @@ tutorial sections. choice to avoid the overhead of offset computations in the ``RAJA::View`` data access operator when they are not needed. +Complete examples illustrating ``RAJA::Layouts`` and ``RAJA::Views`` may +be found in the :ref:`offset-label` and :ref:`permuted-layout-label` +tutorial sections. + Typed Layouts ^^^^^^^^^^^^^ RAJA provides typed variants of ``RAJA::Layout`` and ``RAJA::OffsetLayout`` -enabling user specified index types. Basic usage requires specifying types for -the linear index, and the multi-dimensional indicies. The following example creates -typed layouts wherein the linear index is of type TIL and the multidimensional -indices are TIX, TIY,:: +that enable users to specify integral index types. Usage requires +specifying types for the linear index and the multi-dimensional indices. +The following example creates two two-dimensional typed layouts where the +linear index is of type TIL and the '(x, y)' indices for accessing the data +have types TIX and TIY:: RAJA_INDEX_VALUE(TIX, "TIX"); RAJA_INDEX_VALUE(TIY, "TIY"); @@ -226,13 +282,18 @@ indices are TIX, TIY,:: RAJA::TypedLayout> layout(10, 10); RAJA::TypedOffsetLayout> offLayout(10, 10);; +.. note:: Using the ``RAJA_INDEX_VALUE`` macro to create typed indices + is helpful to prevent incorrect usage by detecting at compile + time when, for example, indices are passed to a view parenthesis + operator in the wrong order. + Shifting Views ^^^^^^^^^^^^^^ -RAJA Views include a shift method enabling users to generate a new View with -offsets to the base View layout. The base View may be templated with either a -standard Layout, OffsetLayout and the typed variants. The generated View will -use an OffsetLayout or TypedOffsetLayout depending on whether the base +RAJA views include a shift method enabling users to generate a new view with +offsets to the base view layout. The base view may be templated with either a +standard layout or offset layout and their typed variants.
The new view will +use an offset layout or typed offset layout depending on whether the base view employed a typed layout. The example below illustrates shifting view indices by :math:`N`, :: @@ -264,17 +325,18 @@ three-dimensional index space to a one-dimensional linear space:: RAJA::Layout<3> layout(5, 7, 11); // Map from 3-D index (2, 3, 1) to the linear index - // Note that there is no striding permutation, so rightmost is stride-1 + // Note that there is no striding permutation, so the rightmost index is + // stride-1 int lin = layout(2, 3, 1); // lin = 188 (= 1 + 3 * 11 + 2 * 11 * 7) // Map from linear index to 3-D index int i, j, k; layout.toIndices(lin, i, j, k); // i,j,k = {2, 3, 1} -``RAJA::Layout`` also supports *projections*, where one or more dimension +RAJA layouts also support *projections*, where one or more dimension extent is zero. In this case, the linear index space is invariant for -those multi-dimensional index entries; thus, the 'toIndicies(...)' method -will always return zero for each dimension with zero extent. For example:: +those index entries; thus, the 'toIndices(...)' method will always return +zero for each dimension with zero extent. For example:: // Create a layout with second dimension extent zero RAJA::Layout<3> layout(3, 0, 5); // The second index, 10, is invariant int lin1 = layout(0, 10, 0); // lin1 = 0 int lin2 = layout(0, 5, 1); // lin2 = 1 - // The inverse mapping always produces a 0 for j + // The inverse mapping always produces zero for j int i,j,k; layout.toIndices(lin2, i, j, k); // i,j,k = {0, 0, 1} @@ -311,7 +373,7 @@ way to do this in parallel using OpenMP and a RAJA atomic view:: // Create a 1-dimensional view for histogram array RAJA::View > hist_view(hist_dat, M); - // Create an atomic view for histogram array + // Create an atomic view into the histogram array using the view above auto hist_atomic_view = RAJA::make_atomic_view(hist_view); RAJA::forall< EXEC_POL >(RAJA::RangeSegment(0, N), [=] (int i) { @@ -321,16 +383,16 @@ way to do this in parallel using OpenMP and a RAJA atomic view:: Here, we create a one-dimensional view for the histogram data array. Then, we create an atomic view from that, which we use in the RAJA loop to compute the histogram entries. Since the view is atomic, only one OpenMP -thread can write to each entry at a time. +thread can write to each array entry at a time. ------------------------------------ RAJA View/Layouts Bounds Checking ------------------------------------ The RAJA CMake variable ``RAJA_ENABLE_BOUNDS_CHECK`` may be used to turn on/off -runtime bounds checking for RAJA Views. This may be a useful debugging aid for -users. When bounds checkoing is turned off (default case), there is no -additional run time overhead incurred. Bounds checking is accomplished within -RAJA layouts (both offset and standard layouts). Upon an out of bounds error, -RAJA will abort the program and print the index that is out of bounds as -well the value of the index and bounds. +runtime bounds checking for RAJA views. This may be a useful debugging aid for +users. When attempting to use an index value that is out of bounds, +RAJA will abort the program and print the index that is out of bounds along +with the valid bounds for that index. Since the bounds checking is a runtime +operation, it incurs non-negligible overhead. When bounds checking is turned +off (default case), there is no additional run time overhead incurred.
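+
+As a minimal sketch of what a bounds violation looks like (the array and view
+names here are hypothetical and not taken from the RAJA example codes),
+consider a two-dimensional view of a small array in a build configured with
+``-DRAJA_ENABLE_BOUNDS_CHECK=On``::
+
+  double data[3 * 4];
+
+  // 3 x 4 view of the data array using a default two-dimensional layout
+  RAJA::View< double, RAJA::Layout<2> > dview(data, 3, 4);
+
+  dview(2, 3) = 1.0;   // in bounds; equivalent to data[2 * 4 + 3]
+  dview(3, 0) = 1.0;   // first index is out of bounds, so a bounds-checked
+                       // build prints the offending index and its valid
+                       // bounds and aborts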
diff --git a/docs/sphinx/user_guide/feature/workgroup.rst b/docs/sphinx/user_guide/feature/workgroup.rst new file mode 100644 index 0000000000..4a89e5b3a2 --- /dev/null +++ b/docs/sphinx/user_guide/feature/workgroup.rst @@ -0,0 +1,303 @@ +.. ## +.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +.. ## and other RAJA project contributors. See the RAJA/COPYRIGHT file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _workgroup-label: + +========= +WorkGroup +========= + +In this section, we describe the basics of RAJA workgroups. +``RAJA::WorkPool``, ``RAJA::WorkGroup``, and ``RAJA::WorkSite`` class templates comprise the +RAJA interface for grouped loop execution. ``RAJA::WorkPool`` takes a set of simple +loops (e.g., non-nested loops) and instantiates a ``RAJA::WorkGroup``. ``RAJA::WorkGroup`` +represents an executable form of those loops and when run makes a ``RAJA::WorkSite``. +``RAJA::WorkSite`` holds all of the resources used for a single run of the loops. Be aware +that the RAJA workgroup constructs API is still being developed and may change in later RAJA +releases. + +.. note:: * All **workgroup** constructs are in the namespace ``RAJA``. + * The ``RAJA::WorkPool``, ``RAJA::WorkGroup``, and ``RAJA::WorkSite`` class templates + are templated on: + * a WorkGroup policy which is composed of: + * a work execution policy. + * a work ordering policy. + * a work storage policy. + * an index type that is the first argument to the loop bodies. + * a list of extra argument types that are the rest of the arguments to + the loop bodies. + * an allocator type to be used for the memory used to store and + manage the loop bodies. + * The ``RAJA::WorkPool::enqueue`` method takes two arguments: + * an iteration space object, and + * a lambda expression representing the loop body. + +Examples showing how to use RAJA workgroup methods may be found in +the :ref:`tutorial-label`. + +For more information on RAJA work policies and iteration space constructs, +see :ref:`policies-label` and :ref:`index-label`, respectively. + +.. _workgroup-Policies-label: + +-------- +Policies +-------- + +The behavior of the RAJA workgroup constructs is determined by a policy. +The ``RAJA::WorkGroupPolicy`` has three components, a work execution policy, +a work ordering policy, and a work storage policy. ``RAJA::WorkPool``, +``RAJA::WorkGroup``, and ``RAJA::WorkSite`` class templates all +take the same policy and template arguments. For example:: + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::seq_work, + RAJA::ordered, + RAJA::ragged_array_of_objects >; + +is a workgroup policy that will run loops sequentially on the host in the order +they were enqueued and store the loop bodies sequentially in single buffer in +memory. + +The work execution policy acts like the execution policies used with ``RAJA::forall`` +and determines the backend used to run the loops and the parallelism within each +loop. + + ====================================== ======================================== + Work Execution Policies Brief description + ====================================== ======================================== + seq_work Execute loop iterations strictly + sequentially. + simd_work Execute loop iterations sequentially and + try to force generation of SIMD + instructions via compiler hints in RAJA + internal implementation. + loop_work Execute loop iterations sequentially and + allow compiler to generate any + optimizations. 
+ omp_work Execute loop iterations in parallel + using OpenMP. + tbb_work Execute loop iterations in parallel + using TBB. + cuda_work, Execute loop iterations in parallel + cuda_work_async using a CUDA kernel launched with given + thread-block size. + omp_target_work Execute loop iterations in parallel + using OpenMP target. + ====================================== ======================================== + +The work ordering policy acts like the segment iteration execution policies when +``RAJA::forall`` is used with a ``RAJA::IndexSet`` and determines the backend +used when iterating over the loops and the parallelism between each loop. + + ====================================== ======================================== + Work Ordering Policies Brief description + ====================================== ======================================== + ordered Execute loops sequentially in the order + they were enqueued using forall. + reverse_ordered Execute loops sequentially in the + reverse of the order they were + enqueued using forall. + unordered_cuda_loop_y_block_iter_x_threadblock_average + Execute loops in parallel by mapping + each loop to a set of cuda blocks with + the same index in the y direction in + a cuda kernel. Each loop is given a + number of threads over one or more + blocks in the x direction equal to the + average number of iterations of all the + loops rounded up to a multiple of the + block size. + ====================================== ======================================== + +The work storage policy determines the strategy used to allocate and layout the +storage used to store the ranges, loop bodies, and other data necessary to +implement the workstorage constructs. + + ====================================== ======================================== + Work Storage Policies Brief description + ====================================== ======================================== + array_of_pointers Store loop data in individual + allocations and keep an array of + pointers to the individual loop data + allocations. + ragged_array_of_objects Store loops sequentially in a single + allocation, reallocating and moving the + loop data items as needed, and keep an + array of offsets to the individual loop + data items. + constant_stride_array_of_objects Store loops sequentially in a single + allocation with a consistent stride + between loop data items, reallocating + and/or changing the stride and moving + the loop data items as needed. + ====================================== ======================================== + + +.. _workgroup-Arguments-label: + +--------- +Arguments +--------- + +The next two template arguments to the workgroup constructs determine the +call signature of the loop bodies that may be added to the workgroup. The first +is an index type which is the first parameter in the call signature. Next is a +list of types called ``RAJA::xargs``, short for extra arguments, that gives the +rest of the types of the parameters in the call signature. The values of the +extra arguments are passed in when the loops are run, see :ref:`workgroup-WorkGroup-label`. +For example:: + + int, RAJA::xargs<> + +can be used with lambdas with the following signature:: + + [=](int) { ... } + +and:: + + int, RAJA::xargs<int*, double> + +can be used with lambdas with the following signature:: + + [=](int, int*, double) { ... } + + +.. 
_workgroup-Allocators-label: + +---------- +Allocators +---------- + +The last template argument to the workgroup constructs is an allocator type +that conforms to the allocator named requirement used in the standard library. +This gives you control over how memory is allocated, for example with umpire, +and what memory space is used, both of which have performance implications. +Find the requirements for allocator types along with a simple example here +https://en.cppreference.com/w/cpp/named_req/Allocator. The default allocator +used by the standard template library may be used with ordered and non-GPU +policies:: + + using Allocator = std::allocator<char>; + +.. note:: * The allocator type must use template argument char. + * Allocators must provide memory that is accessible where it is used. + * Ordered work order policies only require memory that is accessible + where loop bodies are enqueued. + * Unordered work order policies require memory that is accessible + from both where the loop bodies are enqueued and from where the + loop is executed based on the work execution policy. + * For example, when using cuda work execution policies with cuda + unordered work order policies, pinned memory is a good choice + because it is always accessible on the host and device. + + +.. _workgroup-WorkPool-label: + +-------- +WorkPool +-------- + +The ``RAJA::WorkPool`` class template holds a set of simple (e.g., non-nested) +loops that are enqueued one at a time. For example, to enqueue a C-style loop +that adds two vectors, like:: + + for (int i = 0; i < N; ++i) { + c[i] = a[i] + b[i]; + } + +is as simple as calling enqueue on a ``RAJA::WorkPool`` object and passing the +same arguments you would pass to ``RAJA::forall``.:: + + using WorkPool_type = RAJA::WorkPool< workgroup_policy, + int, RAJA::xargs<>, + Allocator >; + WorkPool_type workpool(Allocator{}); + + workpool.enqueue(RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + +Note that WorkPool may have to allocate and reallocate multiple times to store +a set of loops depending on the work storage policy. Reallocation can be avoided +by reserving enough memory before adding any loops.:: + + workpool.reserve(num_loops, storage_bytes); + +Here ``num_loops`` is the number of loops to allocate space for and +``storage_bytes`` is the amount of storage to allocate. These may be used +differently depending on the work storage policy. The number of loops +enqueued in a ``RAJA::WorkPool`` and the amount of storage used may be queried +using:: + + size_t num_loops = workpool.num_loops(); + size_t storage_bytes = workpool.storage_bytes(); + +Storage will automatically be reserved when reusing a ``RAJA::WorkPool`` object +based on the maximum seen values for num_loops and storage_bytes. + +When you've added all the loops you want to the set, you can call instantiate +on the ``RAJA::WorkPool`` to generate a ``RAJA::WorkGroup``.:: + + WorkGroup_type workgroup = workpool.instantiate(); + +.. _workgroup-WorkGroup-label: + +--------- +WorkGroup +--------- + +The ``RAJA::WorkGroup`` class template is responsible for hanging onto the set +of loops and running the loops. The ``RAJA::WorkGroup`` owns its loops and must +not be destroyed before any loops run asynchronously using it have completed. +It is instantiated from a ``RAJA::WorkPool`` object which transfers ownership +of a set of loops to the ``RAJA::WorkGroup`` and prepares the loops to be run.
+For example:: + + using WorkGroup_type = RAJA::WorkGroup< workgroup_policy, + int, RAJA::xargs<>, + Allocator >; + WorkGroup_type workgroup = workpool.instantiate(); + +creates a ``RAJA::WorkGroup`` ``workgroup`` from the loops in ``workpool`` and +leaves ``workpool`` empty and ready for reuse. When you want to run the loops +simply call run on ``workgroup`` and pass in the extra arguments:: + + WorkSite_type worksite = workgroup.run(); + +In this case no extra arguments were passed to run because the ``RAJA::WorkGroup`` +specified no extra arguments ``RAJA::xargs<>``. Passing extra arguments when the +loops are run lets you delay creation of those arguments until you plan to run +the loops. This lets the value of the arguments depend on the loops in the set. +A simple example of this may be found in the tutorial here :ref:`tutorial-label`. +Run produces a ``RAJA::WorkSite`` object. + + +.. _workgroup-WorkSite-label: + +-------- +WorkSite +-------- + +The ``RAJA::WorkSite`` class template is responsible for extending the lifespan +of objects used when running loops asynchronously. This means that the +``RAJA::WorkSite`` object must remain alive until the call to run has been +synchronized. For example the scoping here:: + + { + using WorkSite_type = RAJA::WorkSite< workgroup_policy, + int, RAJA::xargs<>, + Allocator >; + WorkSite_type worksite = workgroup.run(); + + // do other things + + synchronize(); + } + +ensures that ``worksite`` survives until after synchronize is called. diff --git a/docs/sphinx/user_guide/features.rst b/docs/sphinx/user_guide/features.rst index 608c90bec2..8b9caf6b27 100644 --- a/docs/sphinx/user_guide/features.rst +++ b/docs/sphinx/user_guide/features.rst @@ -12,7 +12,7 @@ RAJA Features ************************ -The following sections describe key aspects of the main RAJA features. +The following sections describe key aspects of the main RAJA features. .. toctree:: :maxdepth: 2 @@ -22,7 +22,11 @@ The following sections describe key aspects of the main RAJA features. feature/iteration_spaces feature/view feature/reduction + feature/resource feature/atomic feature/scan + feature/sort feature/local_array feature/tiling + feature/plugins + feature/workgroup \ No newline at end of file diff --git a/docs/sphinx/user_guide/getting_started.rst b/docs/sphinx/user_guide/getting_started.rst index 535301fc89..c48705b2a6 100644 --- a/docs/sphinx/user_guide/getting_started.rst +++ b/docs/sphinx/user_guide/getting_started.rst @@ -22,8 +22,8 @@ Requirements The primary requirement for using RAJA is a C++11 compliant compiler. Accessing various programming model back-ends requires that they be supported by the compiler you chose. Available options and how to enable or disable -them are described in :ref:`configopt-label`. To build and use RAJA in its -simplest form requires: +them are described in :ref:`configopt-label`. To build RAJA in its most basic +form and use its simplest features: - C++ compiler with C++11 support - `CMake `_ version 3.9 or greater. @@ -39,13 +39,13 @@ the command:: $ git clone --recursive https://github.com/LLNL/RAJA.git -The ``--recursive`` argument above is needed to pull in other projects -RAJA depends on as Git *submodules*. Currently, RAJA submodule dependencies -are: +The ``--recursive`` argument above is needed to pull in necessary RAJA +dependencies as Git *submodules*. 
Current RAJA dependencies are: - `BLT build system `_ -- `Camp portable utility library `_ -- `NVIDIA CUB `_ +- `Camp compiler agnostic metaprogramming library `_ +- `CUB CUDA utilities library `_ +- `rocPRIM Hip parallel primitives library `_ You probably don't need to know much about these other projects to start using RAJA. But, if you want to know more about them, click on the links above. @@ -54,7 +54,7 @@ After running the clone command, a copy of the RAJA repository will reside in a ``RAJA`` subdirectory where you ran the clone command. You will be on the ``develop`` branch of RAJA, which is our default branch. -If you forget to pass the ``--recursive`` argument to the ``git clone`` +If you do not pass the ``--recursive`` argument to the ``git clone`` command, you can type the following commands after cloning:: $ cd RAJA @@ -72,57 +72,65 @@ Build and Install ================== Building and installing RAJA can be very easy or more complicated, depending -on which features you want to use and how well you understand how to use -your system. +on which features you want to use and how easy it is to use your system. -------------- Building RAJA -------------- -RAJA uses CMake to configure a build. A basic configuration looks like:: +RAJA uses CMake to configure a build. A "bare bones" configuration looks like:: $ mkdir build-dir && cd build-dir $ cmake -DCMAKE_INSTALL_PREFIX=/path/to/install ../ .. note:: * RAJA requires a minimum CMake version of 3.9. * Builds must be *out-of-source*. RAJA does not allow building in - the source directory, so you must create a build directory. + the source directory, so you must create a build directory and + run CMake in it. -When you run CMake, it will provide output about the compiler that has been -found and which features are discovered. Some RAJA features, like OpenMP -support are enabled if they are discovered. For a complete summary of -configuration options, please see :ref:`configopt-label`. +When you run CMake, it will generate output about the build environment +(compiler and version, options, etc.). Some RAJA features, +like OpenMP support are enabled by default if, for example, the compiler +supports OpenMP. These can be disabled if desired. For a summary of +RAJA configuration options, please see :ref:`configopt-label`. After CMake successfully completes, you compile RAJA by executing the ``make`` command in the build directory; i.e.,:: - $ cd build-dir $ make -If you have access to a multi-core system you can compile in parallel by running -``make -j`` (to build with all available cores) or ``make -j N`` to build using -N cores. +If you have access to a multi-core system, you can compile in parallel by +running ``make -j`` (to build with all available cores) or ``make -j N`` to +build using N cores. -.. note:: RAJA is configured to build its unit tests by default. If you do not - disable them with the appropriate CMake option, you can run them - after the build completes to check if everything compiled properly. - The easiest way to do this is to type:: +.. note:: * RAJA is configured to build its unit tests by default. If you do not + disable them with the appropriate CMake option (please see + :ref:`configopt-label`), you can run them after the build completes + to check if everything is built properly. - $ make test + The easiest way to run the full set of RAJA tests is to type:: - after the build completes. + $ make test - You can also run individual tests by invoking individual test - executables directly. 
They live in subdirectories in the ``test`` - directory. RAJA tests use the - `Google Test framework `_, - so you can also run tests via Google Test commands. + in the build directory after the build completes. - It is very important to note that the version of Googletest that - is used in RAJA version v0.11.0 or newer requires CUDA version - 9.2.x or newer when compiling with nvcc. Thus, if you build - RAJA with CUDA enabled and want to also enable RAJA tests, you - must use CUDA version 9.2.x or newer. + You can also run individual tests by invoking test + executables directly. They will be located in the ``test`` + subdirectory in the build space directory. RAJA tests use the + `Google Test framework `_, + so you can also run tests via Google Test commands. + + * RAJA also contains example and tutorial exercise + programs you can run if you wish. Similar to the RAJA tests, + the examples and exercises are built by default and can be + disabled with CMake options (see :ref:`configopt-label`). The + source files for these are located in the ``RAJA/examples`` and + ``RAJA/exercises`` directories, respectively. When built, the + executables for the examples and exercises will be located in + the ``bin`` subdirectory in the build space directory. Feel free to + experiment by editing the source files and recompiling. + +.. _build-external-tpl-label: .. note:: You may use externally-supplied versions of the camp and cub libraries with RAJA if you wish. To do so, pass the following @@ -130,6 +138,102 @@ N cores. * External camp: -DEXTERNAL_CAMP_SOURCE_DIR= * External cub: -DENABLE_EXTERNAL_CUB=On -DCUB_DIR= +----------------- +GPU Builds, etc. +----------------- + +CUDA +^^^^^^ + +To run RAJA code on NVIDIA GPUs, one typically must have a CUDA compiler +installed on your system, in addition to a host code compiler. You may need +to specify both when you run CMake. The host compiler is specified using the +``CMAKE_CXX_COMPILER`` CMake variable. The CUDA compiler is specified with +the ``CMAKE_CUDA_COMPILER`` variable. + +When using the NVIDIA nvcc compiler for RAJA CUDA functionality, the variables: + + * CMAKE_CUDA_FLAGS_RELEASE + * CMAKE_CUDA_FLAGS_DEBUG + * CMAKE_CUDA_FLAGS_RELWITHDEBINFO + +which corresponding to the standard CMake build types are used to pass flags +to nvcc. + +.. note:: When nvcc must pass options to the host compiler, the arguments + can be included using these CMake variables. Host compiler + options must be prepended with the `-Xcompiler` directive. + +To set the CUDA compute architecture for the nvcc compiler, which should be +chosen based on the NVIDIA GPU hardware you are using, you can use the +``CUDA_ARCH`` CMake variable. For example, the CMake option:: + + -DCUDA_ARCH=sm_60 + +will tell the compiler to use the `sm_60` SASS architecture in its second +stage of compilation. It will pick the PTX architecture to use in the first +stage of compilation that is suitable for the SASS architecture you specify. + +Alternatively, you may specify the PTX and SASS architectures, using +appropriate nvcc options in the ``CMAKE_CUDA_FLAGS_*`` variables. + +.. note:: **RAJA requires a minimum CUDA architecture level of `sm_35` to use + all supported CUDA features.** Mostly, the architecture level affects + which RAJA CUDA atomic operations are available and how they are + implemented inside RAJA. This is described in :ref:`atomics-label`. 
+ + * If you do not specify a value for ``CUDA_ARCH``, it will be set to + `sm_35` by default and CMake will emit a status message + indicatting this choice was made. + + * If you give a ``CUDA_ARCH`` value less than `sm_35` (e.g., `sm_30`), + CMake will report this and stop processing. + +Also, RAJA relies on the CUB CUDA utilities library for some CUDA functionality. +CUB is included with RAJA as a Git submodule and this version will be used if +you do not specify an alternative. To use an externally-supplied CUB library, +provide the following options to CMake: +``-DENABLE_EXTERNAL_CUB=On -DCUB_DIR=``. + +.. note:: It is important to note that the version of Googletest that + is used in RAJA version v0.11.0 or newer requires CUDA version + 9.2.x or newer when compiling with nvcc. Thus, if you build + RAJA with CUDA enabled and want to also enable RAJA tests, you + must use CUDA version 9.2.x or newer. + +Hip +^^^^ + +To run RAJA code on AMD GPUs, one typically uses the Hip compiler and tool +chain (which can also be used to compile code for NVIDIA GPUs). + +.. note:: RAJA requires version 3.5 or newer of the rocm software stack to + use the RAJA Hip back-end. + +OpenMP +^^^^^^^ + +To use OpenMP target offlad GPU execution, additional options may need to be +passed to the compiler. The variable ``OpenMP_CXX_FLAGS`` is used for this. +Option syntax follows the CMake *list* pattern. For example, to specify OpenMP +target options for NVIDIA GPUs using a clang-based compiler, one may do +something like:: + + cmake \ + .... + -DOpenMP_CXX_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" + +---------------------------------------- +RAJA Example Build Configuration Files +---------------------------------------- + +The ``RAJA/scripts`` directory contains subdirectories with a variety of +build scripts we use to build and test RAJA on various platforms with +various compilers. These scripts pass files (*CMake cache files*) located in +the ``RAJA/host-configs`` directory to CMake using the '-C' option. +These files serve as useful examples of how to configure RAJA prior to +compilation. + ---------------- Installing RAJA ---------------- diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index fef1c354a1..25eaf980f9 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -13,8 +13,8 @@ RAJA User Guide RAJA is a software library of C++ abstractions, developed at Lawrence Livermore National Laboratory (LLNL), that enable architecture and programming model -portability for high performance computing (HPC) applications. RAJA has two -main goals: +portability for high performance computing (HPC) applications. RAJA has two +main goals: #. To enable application portability with manageable disruption to existing algorithms and programming styles. #. To achieve performance comparable to using common programming models (e.g., OpenMP, CUDA, etc.) directly. @@ -26,7 +26,7 @@ that extend the generally-accepted *parallel for* idiom. 
Background and Motivation ============================= -Many HPC applications must achieve high performance across a diverse +Many HPC applications must achieve high performance across a diverse range of computer architectures including: Mac and Windows laptops, parallel clusters of multicore commodity processors, and large-scale supercomputers with advanced heterogeneous node architectures that combine @@ -39,37 +39,47 @@ have been made in highly-scalable MPI-only applications that have been in service over multiple platform generations. Often, maintaining developer and user productivity requires the ability to build single-source application source code bases that can be readily ported to new architectures. RAJA is -one C++-based programming model abstraction layer that can help to meet this -performance portability challenge. +one C++ abstraction layer that helps address this performance portability +challenge. RAJA provides portable abstractions for simple and complex loops -- as well -as a variety of loop transformations, reductions, scans, atomic operations, -data layouts and views, iteration spaces, etc. Currently available execution -patterns supported by different programming model back-ends include: -sequential, -`SIMD `_, -`NVIDIA CUDA `_, -`OpenMP `_ CPU multi-threading and target offload. -OpenMP target offload support is incomplete and should be considered -experimental. Support for `Intel Threading Building Blocks (TBB) `_ and `AMD HIP `_ support are also under development. +reductions, scans, atomic operations, sorts, data layouts, views, and loop +iteration spaces, as well as compile-time loop transformations. Features +are continually growing as new use cases arise due to expanding user adoption. -RAJA uses standard C++11 -- C++ is the predominant programming language in -many LLNL applications. RAJA requirements and design are rooted in a +RAJA uses standard C++11 -- C++ is the programming language model of choice +for many HPC applications. RAJA requirements and design are rooted in a decades of developer experience working on production mesh-based -multiphysics applications at LLNL. An important RAJA requirement is that -application developers can specialize RAJA concepts for different code -implementation patterns and C++ usage, since data structures and algorithms +multiphysics applications. An important RAJA requirement is that +application developers can specialize RAJA concepts for different code +implementation patterns and C++ usage, since data structures and algorithms vary widely across applications. RAJA helps developers insulate application loop kernels from underlying architecture and programming model-specific implementation details. Loop bodies and loop execution are decoupled using C++ lambda expressions (loop bodies) and C++ templates (loop execution methods). This approach -promotes the perspective that developers should focus on tuning +promotes the perspective that application developers should focus on tuning loop patterns rather than individual loops as much as possible. RAJA makes it relatively straightforward to parameterize an application using execution policy types so that it can be compiled in a specific configuration suitable -to a given architecture. +to a given architecture. + +RAJA support for various execution back-ends is the result of collaborative +development between the RAJA team and academic and industrial partners. 
+Currently available execution back-ends include: +sequential, +`SIMD `_, +`Threading Building Blocks (TBB) `_, +`NVIDIA CUDA `_, +`OpenMP `_ CPU multithreading and target offload, and +`AMD HIP `_. Sequential, +CUDA, OpenMP CPU multithreading, and HIP execution are supported for all +RAJA features. Sequential, OpenMP CPU multithreading, and CUDA +are considered the most developed at this point as these have been our primary +focus up to now. Those back-ends are used in a wide variety of production +applications. OpenMP target offload and TBB back-ends do not support +all RAJA features and should be considered experimental. ================================ Interacting with the RAJA Team @@ -77,29 +87,29 @@ Interacting with the RAJA Team If you are interested in keeping up with RAJA development and communicating with developers and users, please join our `Google Group -`_, or contact the +`_, or contact the development team via email at ``raja-dev@llnl.gov`` If you have questions, find a bug, have ideas about expanding the functionality or applicability, or wish to contribute to RAJA development, please do not hesitate to contact us. We are always -interested in improving RAJA and exploring new ways to use it. A brief -description of how the RAJA team operates can be found in +interested in improving RAJA and exploring new ways to use it. A brief +description of how the RAJA team operates can be found in :ref:`contributing-label`. ============================= What's In This Guide? ============================= -If you have some familiarity with RAJA and want to get up and running quickly, -check out :ref:`getting_started-label`. This guide contains information +If you have some familiarity with RAJA and want to get up and running quickly, +check out :ref:`getting_started-label`. This guide contains information about accessing the RAJA code, building it, and basic RAJA usage. If you are completely new to RAJA, please check out the :ref:`tutorial-label`. -It contains a discussion of essential C++ concepts and will walk you +It contains a discussion of essential C++ concepts and will walk you through a sequence of code examples that show how to use key RAJA features. -See :ref:`features-label` for a complete, high-level description of RAJA +See :ref:`features-label` for a complete, high-level description of RAJA features (like a reference guide). Additional information about things to think about when considering whether @@ -116,4 +126,5 @@ to use RAJA in an application can be found in :ref:`app-considerations-label`. config_options plugins contributing + developer_guide raja_license diff --git a/docs/sphinx/user_guide/plugins.rst b/docs/sphinx/user_guide/plugins.rst index df603e9702..dabe14acb5 100644 --- a/docs/sphinx/user_guide/plugins.rst +++ b/docs/sphinx/user_guide/plugins.rst @@ -29,7 +29,7 @@ that kernel executes, CHAI will make the data available. To build CHAI with RAJA integration, you need to download and install CHAI with the ``ENABLE_RAJA_PLUGIN`` option turned on. 
Please see the `CHAI project -` for details +`_ for details After CHAI has been build with RAJA support enabled, applications can use CHAI ``ManangedArray`` objects to access data inside a RAJA kernel; for example,:: diff --git a/docs/sphinx/user_guide/tutorial.rst b/docs/sphinx/user_guide/tutorial.rst index 9c59792023..6b9d929ebc 100644 --- a/docs/sphinx/user_guide/tutorial.rst +++ b/docs/sphinx/user_guide/tutorial.rst @@ -14,7 +14,9 @@ RAJA Tutorial This RAJA tutorial introduces RAJA concepts and capabilities via a sequence of examples of increasing complexity. Complete working codes for -the examples are located in the ``RAJA``examples`` directory. +the examples are located in the ``RAJA/examples`` directory. The RAJA +tutorial evolves as we add new features to RAJA, so refer to it periodically +if you are interested in learning about them. To understand the discussion and code examples, a working knowledge of C++ templates and lambda expressions is required. So, before we begin, we provide @@ -27,27 +29,26 @@ transfers between those memory spaces work. For a detailed discussion, see `Device Memory `_. RAJA does not provide a memory model. This is by design as developers of many -of the production applications for which RAJA is targeted prefer to manage -memory themselves. Thus, users are responsible for ensuring that data is -properly allocated and initialized on a GPU device when running GPU code. -This can be done using explicit host and device allocation and copying -between host and device memory spaces or via CUDA unified memory (UM), if -available. RAJA developers also support a library called -`CHAI `_ which complements RAJA by providing -a simple alternative to manual CUDA calls or UM. For more -information, see :ref:`plugins-label`. +of applications that use RAJA prefer to manage memory themselves. Thus, users +are responsible for ensuring that data is properly allocated and initialized +on a GPU device when running GPU code. This can be done using explicit host +and device allocation and copying between host and device memory spaces or via +unified memory (UM), if available. RAJA developers also support a library +called `CHAI `_ which complements RAJA by +providing an alternative to manual host-device memory copy calls or UM. +For more information, see :ref:`plugins-label`. .. _tutorial-lambda-label: =============================== -A Little C++ Lambda Background +A Little C++ Background =============================== RAJA makes heavy use of C++ templates and using RAJA most easily and effectively is done by representing the bodies of loop kernels as C++ lambda -expressions. Alternatively, C++ functors can be used, but we don't recommend -them as they make application source code more complex, potentially placing -a significant negative burden on source code readability and maintainability. +expressions. Alternatively, C++ functors can be used, but they make +application source code more complex, potentially placing a significant +negative burden on source code readability and maintainability. ----------------------------------- C++ Templates ----------------------------------- @@ -65,15 +66,16 @@ template method defined as:: ... 
} -Here, "ExecPol", "IdxType", "LoopBody" are C++ types you, as a user, specify at -compile-time, like this:: +Here, "ExecPol", "IdxType", and "LoopBody" are C++ types a user specifies in +their code; for example:: - forall< RAJA::seq_exec >( RAJA::RangeSegment(0, N), [=](int i) { + RAJA::forall< RAJA::seq_exec >( RAJA::RangeSegment(0, N), [=](int i) { a[i] = b[i] + c[i]; }); -The "IdxTypes" and "LoopBody" types are deduced by the compiler based on what -you specify. Here, the loop body type is defined by the lambda expression:: +The "IdxType" and "LoopBody" types are deduced by the compiler based on what +arguments are passed to the ``RAJA::forall`` method. Here, the loop body type +is defined by the lambda expression:: [=](int i) { a[i] = b[i] + c[i]; } @@ -86,8 +88,8 @@ expressions. A more technical and detailed discussion is available here: `Lambda Functions in C++11 - the Definitive Guide `_ Lambda expressions were introduced in C++ 11 to provide a lexical-scoped -name binding; that is, a *closure* that stores a function with a data -environment. In particular, a lambda expression can *capture* variables from an +name binding; specifically, a *closure* that stores a function with a data +environment. That is, a lambda expression can *capture* variables from an enclosing scope for use within the local scope of the function expression. A C++ lambda expression has the following form:: @@ -97,16 +99,16 @@ A C++ lambda expression has the following form:: The ``capture list`` specifies how variables outside the lambda scope are pulled into the lambda data environment. The ``parameter list`` defines arguments passed to the lambda function body -- for the most part, lambda arguments -are just like arguments to a standard C++ method. Variables in the capture list +are just like arguments in a regular C++ method. Variables in the capture list are initialized when the lambda expression is created, while those in the parameter list are set when the lambda expression is called. The body of a lambda expression is similar to the body of an ordinary C++ method. RAJA templates, such as ``RAJA::forall`` and ``RAJA::kernel`` pass arguments -to lambdas based on usage and context; typically, these are loop indices. +to lambdas based on usage and context; e.g., loop iteration indices. A C++ lambda expression can capture variables in the capture list by value or by reference. This is similar to how arguments to C++ methods are passed; -e.g., pass-by-reference or pass-by-value. However, there are some subtle +i.e., *pass-by-reference* or *pass-by-value*. However, there are some subtle differences between lambda variable capture rules and those for ordinary methods. Variables mentioned in the capture list with no extra symbols are captured by value. Capture-by-reference is accomplished by using the @@ -128,15 +130,16 @@ or:: Note that the following two attempts will generate compilation errors:: - [=](){ x = y; }; // capture all lambda arguments by value... - [x, &y](){ x = y; }; // capture 'x' by value and 'y' by reference... + [=](){ x = y; }; // error: all lambda arguments captured by value, + // so cannot assign to 'x'. + [x, &y](){ x = y; }; // error: cannot assign to 'x' since it is captured + // by value. -Specifically, it is illegal to assign a value to a variable 'x' that is -captured by value since it is `read-only`. 
+**Specifically, a variable that is captured by value is read-only.** ------------------------------------ -Notes About C++ Lambdas ------------------------------------ +---------------------------------------- +A Few Notes About Lambda Usage With RAJA +---------------------------------------- There are several issues to note about C++ lambda expressions; in particular, with respect to RAJA usage. We describe them here. @@ -199,13 +202,14 @@ with respect to RAJA usage. We describe them here. } ); - * **Local stack arrays are not captured by CUDA device lambdas.** + * **Local stack arrays may not be captured by CUDA device lambdas.** - Although this is inconsistent with the C++ standard, attempting to access - elements in a local stack array in a CUDA device lambda may generate a - compilation error depending on the version of the nvcc compiler you are - using. One solution to this problem is to wrap the array in a - struct; for example:: + Although this is inconsistent with the C++ standard (local stack arrays + are properly captured in lambdas for code that will execute on a CPU), + attempting to access elements in a local stack array in a CUDA device + lambda may generate a compilation error depending on the version of the + nvcc compiler you are using. One solution to this problem is to wrap the + array in a struct; for example:: struct array_wrapper { int[4] array; } bounds; bounds.array = { 0, 1, 2, 3 }; RAJA::forall<...>(..., [=] (...) { // access entries of bounds.array } ); - This issue appears to be resolved in in the 10.1 release of the nvcc - compiler. If you are using an earlier version of nvcc, an implementation + This issue appears to be resolved in the 10.1 release of CUDA. If you + are using an earlier version of nvcc, an implementation similar to the one above will be required. @@ -226,17 +230,17 @@ with respect to RAJA usage. We describe them here. RAJA Examples ================ -The remainder of this tutorial illustrates how to use RAJA features using -various working code examples that are located in the ``RAJA/examples`` +The remainder of this tutorial illustrates how to use RAJA features with +working code examples that are located in the ``RAJA/examples`` directory. Additional information about the RAJA features used can be found in :ref:`features-label`. The examples demonstrate CPU execution (sequential, SIMD, OpenMP -multi-threading) and CUDA GPU execution. Examples that show how to use +multithreading) and CUDA GPU execution. Examples that show how to use RAJA with other parallel programming model back-ends that are in -development will appear when we feel RAJA support for them is sufficiently -complete and robust. For adventurous users who wish to try experimental -features, usage is similar to what is shown in the examples here. +development will appear in future RAJA releases. For adventurous users who +wish to try experimental features, usage is similar to what is shown in the +examples here. All RAJA programming model support features are enabled via CMake options, which are described in :ref:`configopt-label`. @@ -253,7 +257,7 @@ Simple Loops and Basic RAJA Features The examples in this section illustrate how to use ``RAJA::forall`` methods to execute simple loop kernels; i.e., non-nested loops. It also describes -iteration spaces, reductions, atomic operations, and scans. +iteration spaces, reductions, atomic operations, scans, and sorts. .. toctree:: :maxdepth: 1
   tutorial/reductions.rst
   tutorial/atomic_histogram.rst
   tutorial/scan.rst
+  tutorial/sort.rst

.. _tutorialcomplex-label:

@@ -286,3 +291,4 @@ tiling mechanisms to transform loop patterns.
   tutorial/offset-layout.rst
   tutorial/tiled_matrix_transpose.rst
   tutorial/matrix_transpose_local_array.rst
+  tutorial/halo-exchange.rst
diff --git a/docs/sphinx/user_guide/tutorial/add_vectors.rst b/docs/sphinx/user_guide/tutorial/add_vectors.rst
index 891258758d..c9ec806645 100644
--- a/docs/sphinx/user_guide/tutorial/add_vectors.rst
+++ b/docs/sphinx/user_guide/tutorial/add_vectors.rst
@@ -73,7 +73,7 @@ This policy allows the compiler to generate optimizations, such as SIMD if
compiler heuristics suggest that it is safe to do so and potentially
beneficial for performance, but the optimizations are not forced.

-To run the kernel with OpenMP multi-threaded parallelism on a CPU, we use the
+To run the kernel with OpenMP multithreaded parallelism on a CPU, we use the
``RAJA::omp_parallel_for_exec`` execution policy:

.. literalinclude:: ../../../../examples/tut_add-vectors.cpp
diff --git a/docs/sphinx/user_guide/tutorial/dot_product.rst b/docs/sphinx/user_guide/tutorial/dot_product.rst
index 5e8d328e4f..ae69eb0f55 100644
--- a/docs/sphinx/user_guide/tutorial/dot_product.rst
+++ b/docs/sphinx/user_guide/tutorial/dot_product.rst
@@ -65,7 +65,7 @@ a reduction value type (i.e., 'double'). An initial value of zero for the
sum is passed to the reduction object constructor. After the kernel executes,
we use the 'get' method to retrieve the reduced value.

-The OpenMP multi-threaded variant of the loop is implemented similarly:
+The OpenMP multithreaded variant of the loop is implemented similarly:

.. literalinclude:: ../../../../examples/tut_dot-product.cpp
   :start-after: _rajaomp_dotprod_start
diff --git a/docs/sphinx/user_guide/tutorial/halo-exchange.rst b/docs/sphinx/user_guide/tutorial/halo-exchange.rst
new file mode 100644
index 0000000000..f1460f964b
--- /dev/null
+++ b/docs/sphinx/user_guide/tutorial/halo-exchange.rst
@@ -0,0 +1,230 @@
+.. ##
+.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC
+.. ## and RAJA project contributors. See the RAJA/COPYRIGHT file
+.. ## for details.
+.. ##
+.. ## SPDX-License-Identifier: (BSD-3-Clause)
+.. ##
+
+.. _halo_exchange-label:
+
+------------------------------------
+Halo Exchange (Workgroup Constructs)
+------------------------------------
+
+Key RAJA features shown in this example:
+
+  * ``RAJA::WorkPool`` workgroup construct
+  * ``RAJA::WorkGroup`` workgroup construct
+  * ``RAJA::WorkSite`` workgroup construct
+  * ``RAJA::RangeSegment`` iteration space construct
+  * RAJA workgroup policies
+
+In this example, we show how to use the RAJA workgroup constructs to implement
+halo exchange packing and unpacking. This may not speed up halo exchange on
+CPUs, but it can significantly speed up halo exchange on GPUs compared to using
+``RAJA::forall`` to run individual kernels.
+
+.. note:: Using an abstraction layer over RAJA can make it easy to switch
+          between using individual ``RAJA::forall`` loops or the RAJA workgroup
+          constructs to implement halo exchange packing and unpacking at
+          compile time or run time.
+
+We start by setting the parameters for the halo exchange, using the default
+values or parsing the command line input. These parameters determine the size
+of the mesh, the width of the halo, the number of variables, and the number of
+cycles.
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_input_params_start
+   :end-before: _halo_exchange_input_params_end
+   :language: C++
+
+Next, we allocate the variables array (the memory manager in
+the example uses CUDA Unified Memory if CUDA is enabled). These grid variables
+will be reset each cycle to allow checking the results of the packing and
+unpacking.
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_vars_allocate_start
+   :end-before: _halo_exchange_vars_allocate_end
+   :language: C++
+
+We also allocate and initialize index lists of the grid elements to pack and
+unpack:
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_index_list_generate_start
+   :end-before: _halo_exchange_index_list_generate_end
+   :language: C++
+
+All the code examples presented below copy the data packed from just inside
+the mesh variable:
+
+  +---+---+---+---+---+
+  | 0 | 0 | 0 | 0 | 0 |
+  +---+---+---+---+---+
+  | 0 | 1 | 2 | 3 | 0 |
+  +---+---+---+---+---+
+  | 0 | 4 | 5 | 6 | 0 |
+  +---+---+---+---+---+
+  | 0 | 7 | 8 | 9 | 0 |
+  +---+---+---+---+---+
+  | 0 | 0 | 0 | 0 | 0 |
+  +---+---+---+---+---+
+
+into the adjacent halo:
+
+  +---+---+---+---+---+
+  | 1 | 1 | 2 | 3 | 3 |
+  +---+---+---+---+---+
+  | 1 | 1 | 2 | 3 | 3 |
+  +---+---+---+---+---+
+  | 4 | 4 | 5 | 6 | 6 |
+  +---+---+---+---+---+
+  | 7 | 7 | 8 | 9 | 9 |
+  +---+---+---+---+---+
+  | 7 | 7 | 8 | 9 | 9 |
+  +---+---+---+---+---+
+
+
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Packing and Unpacking (Basic Loop Execution)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A sequential non-RAJA example of packing:
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_sequential_cstyle_packing_start
+   :end-before: _halo_exchange_sequential_cstyle_packing_end
+   :language: C++
+
+and unpacking:
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_sequential_cstyle_unpacking_start
+   :end-before: _halo_exchange_sequential_cstyle_unpacking_end
+   :language: C++
+
+
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+RAJA Variants using forall
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A sequential RAJA example using these policies and types:
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_loop_forall_policies_start
+   :end-before: _halo_exchange_loop_forall_policies_end
+   :language: C++
+
+of packing:
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_loop_forall_packing_start
+   :end-before: _halo_exchange_loop_forall_packing_end
+   :language: C++
+
+and unpacking:
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_loop_forall_unpacking_start
+   :end-before: _halo_exchange_loop_forall_unpacking_end
+   :language: C++
+
+
+For parallel multithreaded execution via OpenMP, the example can be run
+by replacing the execution policy with:
+
+.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp
+   :start-after: _halo_exchange_openmp_forall_policies_start
+   :end-before: _halo_exchange_openmp_forall_policies_end
+   :language: C++
+
+Similarly, to run the loops in parallel on a CUDA GPU, use these policies:
+
+.. 
literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_cuda_forall_policies_start + :end-before: _halo_exchange_cuda_forall_policies_end + :language: C++ + + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RAJA Variants using workgroup constructs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Using the workgroup constructs in the example requires defining a few more +policies and types: + +.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_loop_workgroup_policies_start + :end-before: _halo_exchange_loop_workgroup_policies_end + :language: C++ + +which are used in a slightly rearranged version of packing. See how the comment +indicating where a message could be sent has been moved down after the call to +run on the workgroup: + +.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_loop_workgroup_packing_start + :end-before: _halo_exchange_loop_workgroup_packing_end + :language: C++ + +Similarly in the unpacking we wait to receive all of the messages before +unpacking is done: + +.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_loop_workgroup_unpacking_start + :end-before: _halo_exchange_loop_workgroup_unpacking_end + :language: C++ + +This reorganization has the downside of not overlapping the message sends with +packing and the message receives with unpacking. + +For parallel multi-threading execution via OpenMP, the example using workgroup +can be run by replacing the policies and types with: + +.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_openmp_workgroup_policies_start + :end-before: _halo_exchange_openmp_workgroup_policies_end + :language: C++ + +Similarly, to run the loops in parallel on a CUDA GPU use these policies and +types, taking note of the unordered work ordering policy that allows the +enqueued loops to all be run using a single cuda kernel: + +.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_cuda_workgroup_policies_start + :end-before: _halo_exchange_cuda_workgroup_policies_end + :language: C++ + +The packing is the same as the previous workgroup packing examples with the +exception of added synchronization after calling run and before sending the +messages. The previous cuda example used forall to launch +``num_neighbors * num_vars`` cuda kernels and performed ``num_neighbors`` +synchronizations to send each message in turn. Here the reorganization to pack +all messages before sending lets us use an unordered cuda work ordering policy +in the workgroup constructs that reduces the number of cuda kernel launches to +one. It also allows us to synchronize once before sending all of the messages: + +.. literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_cuda_workgroup_packing_start + :end-before: _halo_exchange_cuda_workgroup_packing_end + :language: C++ + +After waiting to receive all of the messages we use workgroup constructs using +a cuda unordered work ordering policy to unpack all of the messages using a +single kernel launch: + +.. 
literalinclude:: ../../../../examples/tut_halo-exchange.cpp + :start-after: _halo_exchange_cuda_workgroup_unpacking_start + :end-before: _halo_exchange_cuda_workgroup_unpacking_end + :language: C++ + +Note that the synchronization after unpacking is done to ensure that +``group_unpack`` and ``site_unpack`` survive until the unpacking loop has +finished executing. + + +The file ``RAJA/examples/tut_halo-exchange.cpp`` contains the complete +working example code. diff --git a/docs/sphinx/user_guide/tutorial/indexset_segments.rst b/docs/sphinx/user_guide/tutorial/indexset_segments.rst index cf903a1570..febdf4403d 100644 --- a/docs/sphinx/user_guide/tutorial/indexset_segments.rst +++ b/docs/sphinx/user_guide/tutorial/indexset_segments.rst @@ -168,7 +168,7 @@ policy as before. Before we end the discussion of these examples, we demonstrate a few more index set execution policy variations. To run the previous three segment code by iterating over the segments sequentially and executing each -segment in parallel using OpenMP multi-threading, we would use this policy +segment in parallel using OpenMP multithreading, we would use this policy definition: .. literalinclude:: ../../../../examples/tut_indexset-segments.cpp diff --git a/docs/sphinx/user_guide/tutorial/matrix_multiply.rst b/docs/sphinx/user_guide/tutorial/matrix_multiply.rst index f0bdd84fcc..4b3f09c58e 100644 --- a/docs/sphinx/user_guide/tutorial/matrix_multiply.rst +++ b/docs/sphinx/user_guide/tutorial/matrix_multiply.rst @@ -170,7 +170,7 @@ reorder for-statements for each loop nest level. These execution patterns and transformations can be achieved by changing only the policy and leaving the loop kernel code as is. -If we want to execute the row loop using OpenMP multi-threaded parallelism +If we want to execute the row loop using OpenMP multithreaded parallelism and keep the column loop sequential, the policy we would use is: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp @@ -242,8 +242,8 @@ to specify which arguments each lambda takes and in which order. For example: By using ``RAJA::statement::Lambda`` parameters in this way, the code potentially indicates more clearly which areguments are used. Of course, this makes the execution policy more verbose, but that is typically hidden away -in a header file. Statements such as ``RAJA::statement::Segs``, and -``RAJA::statement::Params`` identify the positions of the segments and params +in a header file. Statements such as ``RAJA::Segs``, and +``RAJA::Params`` identify the positions of the segments and params in the tuples to be used as arguments to the lambda expressions. As we noted earlier, the execution policy type passed to the diff --git a/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst b/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst index f36244c8d6..7dd78085c4 100644 --- a/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst +++ b/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst @@ -116,7 +116,7 @@ kernel is: The ``RAJA::statement::Tile`` types in the execution policy define tiling of the outer 'row' (iteration space tuple index '1') and 'col' (iteration space tuple index '0') loops, including tile sizes -(``RAJA::statement::tile_fixed`` types) and loop execution policies. Next, +(``RAJA::tile_fixed`` types) and loop execution policies. 
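In isolation, and leaving aside the local-array statements discussed next, a tiled
``RAJA::kernel`` policy has roughly the following shape. This is only a sequential sketch with
illustrative tile sizes; it is not the exact policy used in the example::

    using TILED_EXEC_POL =
      RAJA::KernelPolicy<
        // Tile the 'row' (tuple index 1) and 'col' (tuple index 0) iteration spaces.
        RAJA::statement::Tile<1, RAJA::tile_fixed<16>, RAJA::loop_exec,
          RAJA::statement::Tile<0, RAJA::tile_fixed<16>, RAJA::loop_exec,
            // Walk the entries of each tile and invoke the loop body.
            RAJA::statement::For<1, RAJA::loop_exec,
              RAJA::statement::For<0, RAJA::loop_exec,
                RAJA::statement::Lambda<0>
              >
            >
          >
        >
      >;

The policy in the example adds the local memory statements described below to this basic tiled
structure.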
Next, the ``RAJA::statement::InitLocalMem`` type initializes the local stack array based on the memory policy type (here, we use ``RAJA::cpu_tile_mem`` for a CPU stack-allocated array). The ``RAJA::ParamList<2>`` parameter indicates @@ -182,7 +182,7 @@ execution policy and kernel: :language: C++ Here, the two ``RAJA::statement::Lambda`` types in the execution policy show -two different ways to specify the segments (``RAJA::statement::Segs``) +two different ways to specify the segments (``RAJA::Segs``) associated with the matrix column and row indices. That is, we can use a ``Segs`` statement for each argument, or include multiple segment ids in one statement. @@ -191,7 +191,7 @@ Note that we are using ``RAJA::statement::For`` types for the inner tile loops instead of `RAJA::statement::ForICount`` types used in the first variant. As a consequence of specifying lambda arguments, there are two main differences. The local tile indices are properly computed and passed to the lambda -expressions as a result of the ``RAJA::statement::Offsets`` types that appear +expressions as a result of the ``RAJA::Offsets`` types that appear in the lambda statement types. The ``RAJA::statement::Lambda`` type for each lambda shows the two ways to specify the local tile index args; we can use an ``Offsets`` statement for each argument, or include multiple segment ids in one diff --git a/docs/sphinx/user_guide/tutorial/nested_loop_reorder.rst b/docs/sphinx/user_guide/tutorial/nested_loop_reorder.rst index def7d1b9ad..7165d8d6dd 100644 --- a/docs/sphinx/user_guide/tutorial/nested_loop_reorder.rst +++ b/docs/sphinx/user_guide/tutorial/nested_loop_reorder.rst @@ -103,7 +103,7 @@ for-loops, the loop would appear as:: for (int k = 2; k< 4; ++k) { for (int j = 1; j < 3; ++j) { - for (int i = 0; j < 2; ++i) { + for (int i = 0; i < 2; ++i) { // print loop index triple... } } diff --git a/docs/sphinx/user_guide/tutorial/reductions.rst b/docs/sphinx/user_guide/tutorial/reductions.rst index a65b7e94d9..5fbcdfab59 100644 --- a/docs/sphinx/user_guide/tutorial/reductions.rst +++ b/docs/sphinx/user_guide/tutorial/reductions.rst @@ -70,7 +70,7 @@ object is retrieved after the kernel by calling a 'get()' method on the reduction object. The min-loc/max-loc index values are obtained using 'getLoc()' methods. -For parallel multi-threading execution via OpenMP, the example can be run +For parallel multithreading execution via OpenMP, the example can be run by replacing the execution and reduction policies with: .. literalinclude:: ../../../../examples/tut_reductions.cpp diff --git a/docs/sphinx/user_guide/tutorial/scan.rst b/docs/sphinx/user_guide/tutorial/scan.rst index db264781db..c5eec8d9d1 100644 --- a/docs/sphinx/user_guide/tutorial/scan.rst +++ b/docs/sphinx/user_guide/tutorial/scan.rst @@ -69,7 +69,7 @@ We can be explicit about the operation used in the scan by passing the The result in the 'out' array is the same. -An inclusive parallel scan operation using OpenMP multi-threading is +An inclusive parallel scan operation using OpenMP multithreading is accomplished similarly by replacing the execution policy type: .. literalinclude:: ../../../../examples/tut_scan.cpp diff --git a/docs/sphinx/user_guide/tutorial/sort.rst b/docs/sphinx/user_guide/tutorial/sort.rst new file mode 100644 index 0000000000..27ac6310a1 --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/sort.rst @@ -0,0 +1,204 @@ +.. ## +.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. 
See the RAJA/COPYRIGHT file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _sort-label: + +-------------------------------------------------- +Parallel Sort Operations +-------------------------------------------------- + +Key RAJA features shown in this section: + + * ``RAJA::sort`` operation + * ``RAJA::sort_pairs`` operation + * ``RAJA::stable_sort`` operation + * ``RAJA::stable_sort_pairs`` operation + * RAJA comparators for different types of sorts; e.g., less, greater + +Below, we present examples of RAJA sequential, OpenMP, +and CUDA sort operations and show how different sort orderings can be +achieved by passing different RAJA comparators to the RAJA sort template +methods. Each comparator is a template type, where the template argument is +the type of the values it compares. For a summary of RAJA sort +functionality, please see :ref:`sort-label`. + +.. note:: RAJA sort operations use the same execution policy types that + ``RAJA::forall`` loop execution templates do. + +Each of the examples below uses the same integer arrays for input +and output values. We set the input array and print them as follows: + +.. literalinclude:: ../../../../examples/tut_sort.cpp + :start-after: _sort_array_init_start + :end-before: _sort_array_init_end + :language: C++ + +This generates the following sequence of values in the ``in`` array:: + + 6 7 2 1 0 9 4 8 5 3 4 9 6 3 7 0 1 8 2 5 + +This generates the following sequence of values in the ``in`` and ``in_vals`` +arrays:: + + (6,0) (7,0) (2,0) (1,0) (0,0) (9,0) (4,0) (8,0) (5,0) (3,0) + (4,1) (9,1) (6,1) (3,1) (7,1) (0,1) (1,1) (8,1) (2,1) (5,1) + +^^^^^^^^^^^^^^^^ +Unstable Sorts +^^^^^^^^^^^^^^^^ + +A sequential unstable sort operation is performed by: + +.. literalinclude:: ../../../../examples/tut_sort.cpp + :start-after: _sort_seq_start + :end-before: _sort_seq_end + :language: C++ + +Since no comparator is passed to the sort method, the default less operation +is applied and the result generated in the ``out`` array is non-decreasing sort +on the ``out`` array. The resulting ``out`` array contains the values:: + + 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 + +We can be explicit about the operation used in the sort by passing the +less operator to the sort method: + +.. literalinclude:: ../../../../examples/tut_sort.cpp + :start-after: _sort_seq_less_start + :end-before: _sort_seq_less_end + :language: C++ + +The result in the ``out`` array is the same. + +An unstable parallel sort operation using OpenMP multi-threading is +accomplished similarly by replacing the execution policy type: + +.. literalinclude:: ../../../../examples/tut_sort.cpp + :start-after: _sort_omp_less_start + :end-before: _sort_omp_less_end + :language: C++ + +As is commonly done with RAJA, the only difference between this code and +the previous one is that the execution policy is different. If we want to +run the sort on a GPU using CUDA, we would use a CUDA execution policy. This +will be shown shortly. + +^^^^^^^^^^^^^^^^ +Stable Sorts +^^^^^^^^^^^^^^^^ + +A sequential stable sort (less) operation is performed by: + +.. literalinclude:: ../../../../examples/tut_sort.cpp + :start-after: _sort_stable_seq_less_start + :end-before: _sort_stable_seq_less_end + :language: C++ + +This generates the following sequence of values in the output array:: + + 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 + +Note that the stable sort result is the same as the unstable sort in this case +because we are sorting integers. 
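Stability only becomes observable when elements that compare equal carry additional,
distinguishable data. The following small sketch uses the C++ standard library (not the RAJA
sort methods) purely to illustrate that point::

    #include <algorithm>
    #include <utility>
    #include <vector>

    int main()
    {
      std::vector<std::pair<int, int>> v = {{2, 0}, {1, 0}, {2, 1}, {1, 1}};

      // Order by key (first member) only; the second member is a payload.
      auto comp = [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
        return a.first < b.first;
      };

      std::stable_sort(v.begin(), v.end(), comp);
      // Guaranteed result: (1,0) (1,1) (2,0) (2,1) -- pairs with equal keys keep
      // their original relative order; an unstable sort is free to swap them.

      return 0;
    }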
We will show an example of sorting pairs later
+where this is not the case.
+
+Running the same sort operation on a GPU using CUDA is done by:
+
+.. literalinclude:: ../../../../examples/tut_sort.cpp
+   :start-after: _sort_stable_cuda_less_start
+   :end-before: _sort_stable_cuda_less_end
+   :language: C++
+
+Note that we pass the number of threads per CUDA thread block as the template
+argument to the CUDA execution policy as we do in other cases.
+
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Other Comparators
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Using a different comparator allows sorting in a different order.
+Here is a sequential stable sort that uses the greater operator:
+
+.. literalinclude:: ../../../../examples/tut_sort.cpp
+   :start-after: _sort_stable_seq_greater_start
+   :end-before: _sort_stable_seq_greater_end
+   :language: C++
+
+This generates the following sequence of values in non-increasing order in
+the output array::
+
+   9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0
+
+Note that less and greater are the only operators provided by RAJA that are
+valid to use in sort, because they form a strict weak ordering of elements
+for arithmetic types. Also note that the CUDA sort back-end only supports
+RAJA's less and greater operators.
+
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Sort Pairs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Sort *Pairs* operations generate the same results as the sort operations
+we have just described. However, an additional array of values is also permuted
+to match the sorted array, so **two arrays are passed to sort pairs methods.**
+
+Here is a sequential unstable sort pairs that uses the less operator:
+
+.. literalinclude:: ../../../../examples/tut_sort.cpp
+   :start-after: _sort_pairs_seq_less_start
+   :end-before: _sort_pairs_seq_less_end
+   :language: C++
+
+This generates the following sequence in the output array::
+
+   (0,0) (0,1) (1,0) (1,1) (2,0) (2,1) (3,0) (3,1) (4,0) (4,1)
+   (5,1) (5,0) (6,1) (6,0) (7,0) (7,1) (8,0) (8,1) (9,1) (9,0)
+
+Note that some of the pairs with equivalent keys stayed in the same order
+they appeared in the unsorted arrays, like ``(8,0) (8,1)``, while others are
+reversed, like ``(9,1) (9,0)``.
+
+Here is a sequential stable sort pairs that uses the greater operator:
+
+.. literalinclude:: ../../../../examples/tut_sort.cpp
+   :start-after: _sort_stable_pairs_seq_greater_start
+   :end-before: _sort_stable_pairs_seq_greater_end
+   :language: C++
+
+This generates the following sequence in the output array::
+
+   (9,0) (9,1) (8,0) (8,1) (7,0) (7,1) (6,0) (6,1) (5,0) (5,1)
+   (4,0) (4,1) (3,0) (3,1) (2,0) (2,1) (1,0) (1,1) (0,0) (0,1)
+
+Note that all pairs with equivalent keys stayed in the same order that they
+appeared in the unsorted arrays.
+
+As you may expect at this point, running a stable sort pairs
+operation using OpenMP is accomplished by:
+
+.. literalinclude:: ../../../../examples/tut_sort.cpp
+   :start-after: _sort_stable_pairs_omp_greater_start
+   :end-before: _sort_stable_pairs_omp_greater_end
+   :language: C++
+
+This generates the following sequence in the output array (as we saw earlier)::
+
+   (9,0) (9,1) (8,0) (8,1) (7,0) (7,1) (6,0) (6,1) (5,0) (5,1)
+   (4,0) (4,1) (3,0) (3,1) (2,0) (2,1) (1,0) (1,1) (0,0) (0,1)
+
+and the only difference is the execution policy template parameter.
+
+Lastly, we show a parallel unstable sort pairs operation using CUDA:
+
+.. 
literalinclude:: ../../../../examples/tut_sort.cpp + :start-after: _sort_pairs_cuda_greater_start + :end-before: _sort_pairs_cuda_greater_start + :language: C++ + +The file ``RAJA/examples/tut_sort.cpp`` contains the complete +working example code. diff --git a/docs/sphinx/user_guide/tutorial/tiled_matrix_transpose.rst b/docs/sphinx/user_guide/tutorial/tiled_matrix_transpose.rst index 382023bd81..5707a2a2e9 100644 --- a/docs/sphinx/user_guide/tutorial/tiled_matrix_transpose.rst +++ b/docs/sphinx/user_guide/tutorial/tiled_matrix_transpose.rst @@ -60,7 +60,7 @@ RAJA::kernel Variants ^^^^^^^^^^^^^^^^^^^^^ For ``RAJA::kernel`` variants, we use ``RAJA::statement::Tile`` types -for the outer loop tiling and ``RAJA::statement::tile_fixed`` types to +for the outer loop tiling and ``RAJA::tile_fixed`` types to indicate the tile dimensions. The complete sequential RAJA variant is: .. literalinclude:: ../../../../examples/tut_tiled-matrix-transpose.cpp diff --git a/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst b/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst index 4b6ecd1f3b..f45fdbc6cf 100644 --- a/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst +++ b/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst @@ -102,7 +102,7 @@ using the vectors: Now, we can use an index set execution policy that iterates over the segments sequentially and executes each segment in parallel using OpenMP -multi-threading (and ``RAJA::forall``): +multithreading (and ``RAJA::forall``): .. literalinclude:: ../../../../examples/tut_vertexsum-coloring.cpp :start-after: _raja_seq_colorindexset_vertexsum_start diff --git a/docs/sphinx/user_guide/using_raja.rst b/docs/sphinx/user_guide/using_raja.rst index 5d04ef5e68..458585186d 100644 --- a/docs/sphinx/user_guide/using_raja.rst +++ b/docs/sphinx/user_guide/using_raja.rst @@ -12,11 +12,12 @@ Using RAJA in Your Application ****************************** -Using RAJA in an application requires two things: ensuring the header files +Using RAJA in an application requires two things: ensuring the RAJA header files are visible, and linking against the RAJA library. We maintain a `RAJA Template Project `_ -shows how to use RAJA in a CMake project, either as a Git submodule or -as an externally installed library that you link your application against. +that shows how to use RAJA in a project that uses CMake or make, either as a +Git submodule or as an externally installed library that you link your +application against. 
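For reference, a minimal program that compiles and links against RAJA might look like the
following; this is only a sketch (the execution policy, loop body, and sizes are arbitrary)::

    #include "RAJA/RAJA.hpp"

    #include <cstdio>

    int main()
    {
      constexpr int N = 10;
      double* a = new double[N];

      // Sequential RAJA loop; the pointer 'a' is captured by value and the
      // data it points to is written through it.
      RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N), [=](int i) {
        a[i] = 2.0 * i;
      });

      std::printf("a[%d] = %f\n", N - 1, a[N - 1]);

      delete[] a;
      return 0;
    }

If this builds and runs, the include path and link line are set up correctly.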
======================== CMake Configuration File diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 6f5be57599..b488e88050 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -5,6 +5,10 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### +raja_add_executable( + NAME resource-forall + SOURCES resource-forall.cpp) + raja_add_executable( NAME tut_daxpy SOURCES tut_daxpy.cpp) @@ -36,11 +40,15 @@ raja_add_executable( raja_add_executable( NAME tut_reductions SOURCES tut_reductions.cpp) - + raja_add_executable( NAME tut_scan SOURCES tut_scan.cpp) +raja_add_executable( + NAME tut_sort + SOURCES tut_sort.cpp) + raja_add_executable( NAME tut_atomic-histogram SOURCES tut_atomic-histogram.cpp) @@ -61,10 +69,18 @@ raja_add_executable( NAME tut_tiled-matrix-transpose SOURCES tut_tiled-matrix-transpose.cpp) +raja_add_executable( + NAME tut_halo-exchange + SOURCES tut_halo-exchange.cpp) + raja_add_executable( NAME pi-reduce_vs_atomic SOURCES pi-reduce_vs_atomic.cpp) +raja_add_executable( + NAME raja-teams + SOURCES raja-teams.cpp) + raja_add_executable( NAME jacobi SOURCES jacobi.cpp) @@ -76,11 +92,15 @@ raja_add_executable( raja_add_executable( NAME wave-eqn SOURCES wave-eqn.cpp) - + raja_add_executable( NAME ltimes SOURCES ltimes.cpp) +raja_add_executable( + NAME multiview + SOURCES multiview.cpp) + if(ENABLE_TARGET_OPENMP) raja_add_executable( NAME target-kernel @@ -91,4 +111,8 @@ if(ENABLE_TARGET_OPENMP) SOURCES omp-target-ltimes.cpp) endif() +raja_add_executable( + NAME kernel-dynamic-tile + SOURCES kernel-dynamic-tile.cpp) + add_subdirectory(plugin) diff --git a/examples/jacobi.cpp b/examples/jacobi.cpp index a2c43db22b..f391e5a8a1 100644 --- a/examples/jacobi.cpp +++ b/examples/jacobi.cpp @@ -317,8 +317,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using jacobiCUDANestedPolicy = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed<32>, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed<32>, RAJA::cuda_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, RAJA::statement::For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0> @@ -394,8 +394,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using jacobiHIPNestedPolicy = RAJA::KernelPolicy< RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed<32>, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed<32>, RAJA::hip_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_y_direct, RAJA::statement::For<0, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<0> diff --git a/examples/kernel-dynamic-tile.cpp b/examples/kernel-dynamic-tile.cpp new file mode 100644 index 0000000000..5de2123425 --- /dev/null +++ b/examples/kernel-dynamic-tile.cpp @@ -0,0 +1,34 @@ +#include "RAJA/RAJA.hpp" + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + std::cout << "\n\nRAJA dynamic_tile example...\n\n"; + +//Using policy = KernelPolicy, seq_exec, …>>; +//RAJA::kernel_param( +// make_tuple(RangeSegment(0,N)), +// make_tuple(32), // param 0 is referenced 
by tile_dynamic +// [=](int i, int tile_size){ +// +// }); + + using namespace RAJA; + + kernel_param< + KernelPolicy< + statement::Tile<1, tile_dynamic<1>, seq_exec, + statement::Tile<0, tile_dynamic<0>, seq_exec, + statement::For<1, seq_exec, + statement::For<0, seq_exec, statement::Lambda<0>> + > + > + > + > + >(make_tuple(RangeSegment{0,25}, RangeSegment{0,25}), + make_tuple(TileSize{5}, TileSize{10}), + //make_tuple(TileSize(10)), // not sure we need this, good for static_assert + [=](int i, int j, TileSize x, TileSize y){ + std::cout << "Running index (" << i << "," << j << ") of " << x.size << "x" << y.size << " tile." << std::endl; + }); + +} diff --git a/examples/ltimes.cpp b/examples/ltimes.cpp index 3cd769cb50..266859ac20 100644 --- a/examples/ltimes.cpp +++ b/examples/ltimes.cpp @@ -25,6 +25,7 @@ #include #endif + /* * LTimes Example * @@ -276,6 +277,73 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// +{ + std::cout << "\n Running RAJA sequential ARGS version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; + + std::array L_perm {{0, 1}}; + LView L(L_data, + RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + + std::array psi_perm {{0, 1, 2}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{0, 1, 2}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + using EXECPOL = + RAJA::KernelPolicy< + statement::For<0, loop_exec, // m + statement::For<1, loop_exec, // d + statement::For<2, loop_exec, // g + statement::For<3, simd_exec, // z + statement::Lambda<0, Segs<0, 1, 2, 3>> + > + > + > + > + >; + + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); + + RAJA::Timer timer; + timer.start(); + + RAJA::kernel( segments, + [=] (IM m, ID d, IG g, IZ z) { + phi(m, g, z) += L(m, d) * psi(d, g, z); + } + ); + + timer.stop(); + std::cout << " RAJA sequential ARGS version of LTimes run time (sec.): " + << timer.elapsed() << std::endl; + +#if defined(DEBUG_LTIMES) + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); +#endif +} + +//----------------------------------------------------------------------------// + { std::cout << "\n Running RAJA sequential shmem version of LTimes...\n"; @@ -283,14 +351,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension + // + // L(m, d) : 1 -> d is stride-1 dimension using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension + // psi(d, g, z) : 2 -> z is stride-1 dimension using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension + // phi(m, g, z) : 2 -> z is stride-1 dimension using PhiView = TypedView, IM, IG, IZ>; std::array L_perm {{0, 1}}; @@ -312,54 +380,63 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using RAJA::statement::Param; - using EXECPOL = + 
using EXECPOL = RAJA::KernelPolicy< // Create memory tiles statement::InitLocalMem, // Tile outer m,d loops - statement::Tile<0, statement::tile_fixed, loop_exec, // m - statement::Tile<1, statement::tile_fixed, loop_exec, // d + statement::Tile<0, tile_fixed, loop_exec, // m + statement::Tile<1, tile_fixed, loop_exec, // d // Load L(m,d) for m,d tile into shmem - statement::ForICount<0, Param<3>, loop_exec, // m - statement::ForICount<1, Param<4>, loop_exec, // d - statement::Lambda<1> + statement::For<0, loop_exec, // m + statement::For<1, loop_exec, // d + statement::Lambda<0, Segs<0, 1>, + Params<0>, + Offsets<0, 1>> > >, // Run inner g, z loops with z loop tiled statement::For<2, loop_exec, // g - statement::Tile<3, statement::tile_fixed, loop_exec, // z + statement::Tile<3, tile_fixed, loop_exec, // z // Load psi into shmem - statement::ForICount<1, Param<4>, loop_exec, // d - statement::ForICount<3, Param<6>, loop_exec, // z - statement::Lambda<2> + statement::For<1, loop_exec, // d + statement::For<3, loop_exec, // z + statement::Lambda<1, Segs<1, 2, 3>, + Params<1>, + Offsets<1, 2, 3>> > >, // Compute phi - statement::ForICount<0, Param<3>, loop_exec, // m + statement::For<0, loop_exec, // m // Load phi into shmem - statement::ForICount<3, Param<6>, loop_exec, // z - statement::Lambda<3> + statement::For<3, loop_exec, // z + statement::Lambda<2, Segs<0, 2, 3>, + Params<2>, + Offsets<0, 2, 3>> >, - // Compute phi in shmem - statement::ForICount<1, Param<4>, loop_exec, // d - statement::ForICount<3, Param<6>, loop_exec, // z - statement::Lambda<4> + // Compute phi in shmem + statement::For<1, loop_exec, // d + statement::For<3, loop_exec, // z + statement::Lambda<3, Params<0, 1, 2>, + Offsets<0, 1, 2, 3>> > >, // Store phi - statement:: ForICount<3, Param<6>, loop_exec, // z - statement::Lambda<5> + statement:: For<3, loop_exec, // z + statement::Lambda<4, Segs<0, 2, 3>, + Params<2>, + Offsets<0, 2, 3>> > > // m @@ -371,7 +448,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // Tile d > // Tile m > // LocalMemory - >; // KernelPolicy + >; // KernelPolicy @@ -379,31 +456,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Define statically dimensioned local arrays used in kernel // - using shmem_L_t = RAJA::TypedLocalArray, IM, ID>; shmem_L_t shmem_L; - using shmem_psi_t = RAJA::TypedLocalArray, ID, IG, IZ>; shmem_psi_t shmem_psi; - - - using shmem_phi_t = RAJA::TypedLocalArray, IM, IG, IZ>; shmem_phi_t shmem_phi; - + RAJA::Timer timer; timer.start(); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), RAJA::TypedRangeSegment(0, num_d), @@ -413,57 +490,44 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // They are the last args in all lambdas (after indices). 
RAJA::make_tuple( shmem_L, shmem_psi, - shmem_phi, - IM(0), - ID(0), - IG(0), - IZ(0) - ), - - // Lambda<0> : Single lambda version - [=] (IM m, ID d, IG g, IZ z, - shmem_L_t&, shmem_psi_t&, shmem_phi_t&, - IM , ID , IG , IZ ) - { - phi(m, g, z) += L(m, d) * psi(d, g, z); - }, + shmem_phi), - // Lambda<1> : Load L into shmem - [=] (IM m, ID d, IG /*g*/, IZ /*z*/, - shmem_L_t& sh_L, shmem_psi_t&, shmem_phi_t&, - IM tm, ID td, IG , IZ ) + + // Lambda<0> : Load L into shmem + [=] (IM m, ID d, + shmem_L_t& sh_L, + IM tm, ID td) { sh_L(tm, td) = L(m, d); }, - // Lambda<2> : Load psi into shmem - [=] (IM /*m*/, ID d, IG g, IZ z, - shmem_L_t&, shmem_psi_t& sh_psi, shmem_phi_t&, - IM , ID td, IG tg, IZ tz) + // Lambda<1> : Load psi into shmem + [=] (ID d, IG g, IZ z, + shmem_psi_t& sh_psi, + ID td, IG tg, IZ tz) { sh_psi(td, tg, tz) = psi(d, g, z); }, - // Lambda<3> : Load phi into shmem - [=] (IM m, ID /*d*/, IG g, IZ z, - shmem_L_t&, shmem_psi_t&, shmem_phi_t& sh_phi, - IM tm, ID , IG tg, IZ tz) + // Lambda<2> : Load phi into shmem + [=] (IM m, IG g, IZ z, + shmem_phi_t& sh_phi, + IM tm, IG tg, IZ tz) { sh_phi(tm, tg, tz) = phi(m, g, z); }, - // Lambda<4> : Compute phi in shmem - [=] (IM , ID , IG , IZ , - shmem_L_t& sh_L, shmem_psi_t& sh_psi, shmem_phi_t& sh_phi, - IM tm, ID td, IG tg, IZ tz) + // Lambda<3> : Compute phi in shmem + [=] (shmem_L_t& sh_L, shmem_psi_t& sh_psi, shmem_phi_t& sh_phi, + IM tm, ID td, IG tg, IZ tz) { sh_phi(tm, tg, tz) += sh_L(tm, td) * sh_psi(td, tg, tz); }, - // Lambda<5> : Store phi - [=] (IM m, ID /*d*/, IG g, IZ z, - shmem_L_t&, shmem_psi_t&, shmem_phi_t& sh_phi, - IM tm, ID , IG tg, IZ tz) + // Lambda<4> : Store phi + [=] (IM m, IG g, IZ z, + shmem_phi_t& sh_phi, + IM tm, IG tg, IZ tz) { phi(m, g, z) = sh_phi(tm, tg, tz); } @@ -478,6 +542,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif } + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -742,50 +807,52 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Define our execution policy // - using RAJA::statement::Param; + using RAJA::Segs; + using RAJA::Params; + using RAJA::Offsets; using EXECPOL = RAJA::KernelPolicy< statement::CudaKernelAsync< statement::InitLocalMem, // Tile outer m,d loops - statement::Tile<0, statement::tile_fixed, seq_exec, // m - statement::Tile<1, statement::tile_fixed, seq_exec, // d + statement::Tile<0, tile_fixed, seq_exec, // m + statement::Tile<1, tile_fixed, seq_exec, // d // Load L for m,d tile into shmem - statement::ForICount<1, Param<4>, cuda_thread_x_loop, // d - statement::ForICount<0, Param<3>, cuda_thread_y_direct, // m - statement::Lambda<0> + statement::For<1, cuda_thread_x_loop, // d + statement::For<0, cuda_thread_y_direct, // m + statement::Lambda<0, Segs<0,1>, Params<0>, Offsets<0,1>> > >, statement::CudaSyncThreads, // Distribute g, z across blocks and tile z statement::For<2, cuda_block_y_loop, // g - statement::Tile<3, statement::tile_fixed, cuda_block_x_loop, // z + statement::Tile<3, tile_fixed, cuda_block_x_loop, // z // Load phi into thread local storage - statement::ForICount<3, Param<6>, cuda_thread_x_direct, // z - statement::ForICount<0, Param<3>, cuda_thread_y_direct, // m - statement::Lambda<2> + statement::For<3, cuda_thread_x_direct, // z + statement::For<0, cuda_thread_y_direct, // m + statement::Lambda<2, Segs<0,2,3>, Params<2>> > >, // Load slice of psi into shmem - 
statement::ForICount<3, Param<6>, cuda_thread_x_direct, // z - statement::ForICount<1, Param<4>, cuda_thread_y_loop, // d (reusing y) - statement::Lambda<1> + statement::For<3,cuda_thread_x_direct, // z + statement::For<1, cuda_thread_y_loop, // d (reusing y) + statement::Lambda<1, Segs<1,2,3>, Params<1>, Offsets<1,2,3>> > >, statement::CudaSyncThreads, // Compute phi - statement::ForICount<3, Param<6>, cuda_thread_x_direct, // z - statement::ForICount<0, Param<3>, cuda_thread_y_direct, // m + statement::For<3, cuda_thread_x_direct, // z + statement::For<0, cuda_thread_y_direct, // m // Compute thread-local Phi value and store - statement::ForICount<1, Param<4>, seq_exec, // d - statement::Lambda<3> + statement::For<1, seq_exec, // d + statement::Lambda<3, Segs<0,1,2,3>, Params<0,1,2>, Offsets<0,1,2,3>> > // d > // m >, // z @@ -794,9 +861,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) statement::CudaSyncThreads, // Write out phi from thread local storage - statement::ForICount<3, Param<6>, cuda_thread_x_direct, // z - statement::ForICount<0, Param<3>, cuda_thread_y_direct, // m - statement::Lambda<4> + statement::For<3, cuda_thread_x_direct, // z + statement::For<0, cuda_thread_y_direct, // m + statement::Lambda<4, Segs<0,2,3>, Params<2>> > >, statement::CudaSyncThreads @@ -820,7 +887,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.start(); RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::make_tuple( + RAJA::TypedRangeSegment(0, num_m), RAJA::TypedRangeSegment(0, num_d), RAJA::TypedRangeSegment(0, num_g), RAJA::TypedRangeSegment(0, num_z)), @@ -831,31 +899,26 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // computing a phi value, for shared memory before writing to phi array. 
RAJA::make_tuple( shmem_L, shmem_psi, - 0.0, - IM(0), - ID(0), - IG(0), - IZ(0)), + 0.0), // Lambda<0> : Load L into shmem - [=] RAJA_DEVICE (IM m, ID d, IG g, IZ z, - shmem_L_t& sh_L, shmem_psi_t&, double&, - IM tm, ID td, IG, IZ) { + [=] RAJA_DEVICE (IM m, ID d, + shmem_L_t& sh_L, + IM tm, ID td) { sh_L(tm, td) = L(m, d); }, // Lambda<1> : Load slice of psi into shmem - [=] RAJA_DEVICE (IM /*m*/, ID d, IG g, IZ z, - shmem_L_t&, shmem_psi_t& sh_psi, double&, - IM, ID td, IG tg, IZ tz) { + [=] RAJA_DEVICE (ID d, IG g, IZ z, + shmem_psi_t& sh_psi, + ID td, IG tg, IZ tz) { sh_psi(td, tg, tz) = psi(d, g, z); }, // Lambda<2> : Load thread-local phi value - [=] RAJA_DEVICE (IM m, ID /*d*/, IG g, IZ z, - shmem_L_t&, shmem_psi_t&, double& phi_local, - IM, ID, IG, IZ) { + [=] RAJA_DEVICE (IM m, IG g, IZ z, + double& phi_local) { phi_local = phi(m, g, z); }, @@ -869,9 +932,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // Lambda<4> : Store phi - [=] RAJA_DEVICE (IM m, ID /*d*/, IG g, IZ z, - shmem_L_t&, shmem_psi_t&, double& phi_local, - IM, ID, IG, IZ) { + [=] RAJA_DEVICE (IM m, IG g, IZ z, + double& phi_local) { phi(m, g, z) = phi_local; } @@ -1083,64 +1145,67 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using RAJA::statement::Param; + using RAJA::Segs; + using RAJA::Params; + using RAJA::Offsets; using EXECPOL = RAJA::KernelPolicy< statement::HipKernelAsync< statement::InitLocalMem, - // Tile outer m,d loops - statement::Tile<0, statement::tile_fixed, seq_exec, // m - statement::Tile<1, statement::tile_fixed, seq_exec, // d - - // Load L for m,d tile into shmem - statement::ForICount<1, Param<4>, hip_thread_x_loop, // d - statement::ForICount<0, Param<3>, hip_thread_y_direct, // m - statement::Lambda<0> + // Tile outer m,d loops + statement::Tile<0, tile_fixed, seq_exec, // m + statement::Tile<1, tile_fixed, seq_exec, // d + + // Load L for m,d tile into shmem + statement::For<1, hip_thread_x_loop, // d + statement::For<0, hip_thread_y_direct, // m + statement::Lambda<0, Segs<0,1>, Params<0>, Offsets<0,1>> > >, statement::HipSyncThreads, // Distribute g, z across blocks and tile z statement::For<2, hip_block_y_loop, // g - statement::Tile<3, statement::tile_fixed, hip_block_x_loop, // z + statement::Tile<3, tile_fixed, hip_block_x_loop, // z // Load phi into thread local storage - statement::ForICount<3, Param<6>, hip_thread_x_direct, // z - statement::ForICount<0, Param<3>, hip_thread_y_direct, // m - statement::Lambda<2> + statement::For<3, hip_thread_x_direct, // z + statement::For<0, hip_thread_y_direct, // m + statement::Lambda<2, Segs<0,2,3>, Params<2>> > >, // Load slice of psi into shmem - statement::ForICount<3, Param<6>, hip_thread_x_direct, // z - statement::ForICount<1, Param<4>, hip_thread_y_loop, // d (reusing y) - statement::Lambda<1> + statement::For<3, hip_thread_x_direct, // z + statement::For<1, hip_thread_y_loop, // d (reusing y) + statement::Lambda<1, Segs<1,2,3>, Params<1>, Offsets<1,2,3>> > >, statement::HipSyncThreads, // Compute phi - statement::ForICount<3, Param<6>, hip_thread_x_direct, // z - statement::ForICount<0, Param<3>, hip_thread_y_direct, // m + statement::For<3, hip_thread_x_direct, // z + statement::For<0, hip_thread_y_direct, // m // Compute thread-local Phi value and store - statement::ForICount<1, Param<4>, seq_exec, // d - statement::Lambda<3> + statement::For<1, seq_exec, // d + statement::Lambda<3, Segs<0,1,2,3>, Params<0,1,2>, Offsets<0,1,2,3>> > // d > // m >, // z - + // finish tile over 
directions statement::HipSyncThreads, // Write out phi from thread local storage - statement::ForICount<3, Param<6>, hip_thread_x_direct, // z - statement::ForICount<0, Param<3>, hip_thread_y_direct, // m - statement::Lambda<4> + statement::For<3, hip_thread_x_direct, // z + statement::For<0, hip_thread_y_direct, // m + statement::Lambda<4, Segs<0,2,3>, Params<2>> > >, statement::HipSyncThreads - + > // Tile z > // g @@ -1154,13 +1219,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) - RAJA::Timer timer; hipErrchk( hipDeviceSynchronize() ); timer.start(); RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::make_tuple( + RAJA::TypedRangeSegment(0, num_m), RAJA::TypedRangeSegment(0, num_d), RAJA::TypedRangeSegment(0, num_g), RAJA::TypedRangeSegment(0, num_z)), @@ -1171,31 +1236,26 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // computing a phi value, for shared memory before writing to phi array. RAJA::make_tuple( shmem_L, shmem_psi, - 0.0, - IM(0), - ID(0), - IG(0), - IZ(0)), + 0.0), // Lambda<0> : Load L into shmem - [=] RAJA_DEVICE (IM m, ID d, IG g, IZ z, - shmem_L_t& sh_L, shmem_psi_t&, double&, - IM tm, ID td, IG, IZ) { + [=] RAJA_DEVICE (IM m, ID d, + shmem_L_t& sh_L, + IM tm, ID td) { sh_L(tm, td) = L(m, d); }, // Lambda<1> : Load slice of psi into shmem - [=] RAJA_DEVICE (IM /*m*/, ID d, IG g, IZ z, - shmem_L_t&, shmem_psi_t& sh_psi, double&, - IM, ID td, IG tg, IZ tz) { + [=] RAJA_DEVICE (ID d, IG g, IZ z, + shmem_psi_t& sh_psi, + ID td, IG tg, IZ tz) { sh_psi(td, tg, tz) = psi(d, g, z); }, // Lambda<2> : Load thread-local phi value - [=] RAJA_DEVICE (IM m, ID /*d*/, IG g, IZ z, - shmem_L_t&, shmem_psi_t&, double& phi_local, - IM, ID, IG, IZ) { + [=] RAJA_DEVICE (IM m, IG g, IZ z, + double& phi_local) { phi_local = phi(m, g, z); }, @@ -1209,9 +1269,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // Lambda<4> : Store phi - [=] RAJA_DEVICE (IM m, ID /*d*/, IG g, IZ z, - shmem_L_t&, shmem_psi_t&, double& phi_local, - IM, ID, IG, IZ) { + [=] RAJA_DEVICE (IM m, IG g, IZ z, + double& phi_local) { phi(m, g, z) = phi_local; } diff --git a/examples/multiview.cpp b/examples/multiview.cpp new file mode 100644 index 0000000000..65975fd144 --- /dev/null +++ b/examples/multiview.cpp @@ -0,0 +1,200 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "RAJA/RAJA.hpp" +#include +#include + +/* + * MultiView Usage Example + * + * A RAJA::MultiView object wraps an array-of-pointers, + * or a pointer-to-pointers, whereas a RAJA::View wraps a single + * pointer or array. This allows a single RAJA::Layout to be applied to + * multiple arrays internal to the MultiView, allowing multiple arrays to share indexing + * arithmetic when their access patterns are the same. + * + * The instantiation of a MultiView works exactly like a standard View, + * except that it takes an array-of-pointers. In the following example, a MultiView + * applies a 1-D layout of length 4 to 2 internal arrays in myarr: + * + * // Arrays of the same size, which will become internal to the MultiView. 
+ * int a1[4] = {5,6,7,8}; + * int a2[4] = {9,10,11,12}; + * + * // Array-of-pointers which will be passed into MultiView. + * int * myarr[2]; + * myarr[0] = a1; + * myarr[1] = a2; + * + * // This MultiView applies a 1-D layout of length 4 to each internal array in myarr. + * RAJA::MultiView< int, RAJA::Layout<1> > MView(myarr, 4); + * + * The default MultiView accesses internal arrays via the 0th index of the MultiView: + * + * MView( 0, 4 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 + * MView( 1, 2 ); // accesses 2nd index of the 1st internal array a2, returns value of 10 + * + * The index into the array-of-pointers can be moved to different + * indices of the MultiView () access operator, rather than the default 0th index. By + * passing a third template parameter to the MultiView constructor, the internal array index + * and the integer indicating which array to access can be reversed: + * + * // MultiView with array-of-pointers index in 1st position + * RAJA::MultiView< int, RAJA::Layout<1>, 1 > MView1(myarr, 4); + * + * MView1( 4, 0 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 + * MView1( 2, 1 ); // accesses 2nd index of the 1st internal array a2, returns value of 10 + * + * As the number of Layout dimensions increases, the index into the array-of-pointers can be + * moved to more distinct locations in the MultiView () access operator. Here is an example + * which compares the accesses of a 2-D layout on a normal RAJA::View with a RAJA::MultiView + * with the array-of-pointers index set to the 2nd position: + * + * RAJA::View< int, RAJA::Layout<2> > normalView(a1, 2, 2); + * + * normalView( 2, 1 ); // accesses 3rd index of the a1 array, value = 7 + * + * // MultiView with array-of-pointers index in 2nd position + * RAJA::MultiView< int, RAJA::Layout<2>, 2 > MView2(myarr, 2, 2); + * + * MView2( 2, 1, 0 ); // accesses the 3rd index of the 0th internal array a1, returns value of 7 (same as normaView(2,1)) + * MView2( 2, 1, 1 ); // accesses the 3rd index of the 1st internal array a2, returns value of 11 + * + * The following code demonstrates 2 aspects of RAJA::MultiView usage: + * - Basic usage + * - Moving of the array-of-pointers index + */ + +void docs_example() +{ + // temporaries + int t1, t2, t3, t4; + + printf( "MultiView Example from RAJA Documentation:\n" ); + + // _multiview_example_1Dinit_start + // Arrays of the same size, which will become internal to the MultiView. + int a1[4] = {5,6,7,8}; + int a2[4] = {9,10,11,12}; + + // Array-of-pointers which will be passed into MultiView. + int * myarr[2]; + myarr[0] = a1; + myarr[1] = a2; + + // This MultiView applies a 1-D layout of length 4 to each internal array in myarr. + RAJA::MultiView< int, RAJA::Layout<1> > MView(myarr, 4); + // _multiview_example_1Dinit_end + + // _multiview_example_1Daccess_start + t1 = MView( 0, 3 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 + t2 = MView( 1, 2 ); // accesses 3rd index of the 1st internal array a2, returns value of 11 + // _multiview_example_1Daccess_end + + // _multiview_example_1Daopindex_start + // MultiView with array-of-pointers index in 1st position. 
+ RAJA::MultiView< int, RAJA::Layout<1>, 1 > MView1(myarr, 4); + + t3 = MView1( 3, 0 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 + t4 = MView1( 2, 1 ); // accesses 3rd index of the 1st internal array a2, returns value of 11 + // _multiview_example_1Daopindex_end + + printf( "Comparison of default MultiView with another MultiView that has the array-of-pointers index in the 1st position of the () accessor:\n" ); + printf( "MView( 0, 3 ) = %i, MView1( 3, 0 ) = %i\n", t1, t3 ); + printf( "MView( 1, 2 ) = %i, MView1( 2, 1 ) = %i\n", t2, t4 ); + + // _multiview_example_2Daopindex_start + RAJA::View< int, RAJA::Layout<2> > normalView(a1, 2, 2); + + t1 = normalView( 1, 1 ); // accesses 4th index of the a1 array, value = 8 + + // MultiView with array-of-pointers index in 2nd position + RAJA::MultiView< int, RAJA::Layout<2>, 2 > MView2(myarr, 2, 2); + + t2 = MView2( 1, 1, 0 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 (same as normalView(1,1)) + t3 = MView2( 0, 0, 1 ); // accesses the 1st index of the 1st internal array a2, returns value of 9 + // _multiview_example_2Daopindex_end + + printf( "Comparison of 2D normal View with 2D MultiView that has the array-of-pointers index in the 2nd position of the () accessor:\n" ); + printf( "normalView( 1, 1 ) = %i, MView2( 1, 1, 0 ) = %i\n", t1, t2 ); +} + +int main() +{ + docs_example(); + + constexpr int N = 12; + int * myarr[2]; // two 3x4 arrays + int arr1[N]; + int arr2[N]; + + for ( int ii = 0; ii < N; ++ii ) + { + arr1[ii] = 100 + ii; + arr2[ii] = 200 + ii; + } + + myarr[0] = arr1; + myarr[1] = arr2; + + // 4x3 layout + std::array perm { {0, 1} }; + RAJA::Layout<2> layout = RAJA::make_permuted_layout( + { {4, 3} }, perm + ); + + // Basic MultiView usage + // Default usage: no specified array-of-pointers index moving + // 0th position is used as the array-of-pointers index + RAJA::MultiView> arrView(myarr, layout); + + // Moved array-of-pointers index MultiView usage + // Add an array-of-pointers index specifier + constexpr int aopidx = 1; + RAJA::MultiView, aopidx> arrViewMov(myarr, layout); + + // Comparing values of both views + printf ( "Comparing values of both default and 1-index-ed MultiViews:\n" ); + for ( int pp = 0; pp < 2; ++pp ) + { + for ( int kk = 0; kk < 4; ++kk ) + { + for ( int jj = 0; jj < 3; ++jj ) + { + printf ( "arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", pp, kk, jj, arrView(pp, kk, jj), kk, pp, jj, arrViewMov(kk, pp, jj) ); + } + } + } + + // switch values + printf ( "Switching values\n" ); + for ( int kk = 0; kk < 4; ++kk ) + { + for ( int jj = 0; jj < 3; ++jj ) + { + int temp = arrView(0, kk, jj); + arrView(0, kk, jj) = arrView(1, kk, jj); + arrView(1, kk, jj) = temp; + } + } + + // Comparing switched values of both views + printf ( "Comparing switched values of both default and 1-index-ed MultiViews:\n" ); + for ( int pp = 0; pp < 2; ++pp ) + { + for ( int kk = 0; kk < 4; ++kk ) + { + for ( int jj = 0; jj < 3; ++jj ) + { + printf ( "arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", pp, kk, jj, arrView(pp, kk, jj), kk, pp, jj, arrViewMov(kk, pp, jj) ); + } + } + } + + return 0; +} diff --git a/examples/plugin/CMakeLists.txt b/examples/plugin/CMakeLists.txt index bbe173e26f..bb67edc4e6 100644 --- a/examples/plugin/CMakeLists.txt +++ b/examples/plugin/CMakeLists.txt @@ -8,3 +8,11 @@ raja_add_executable( NAME plugin-example SOURCES test-plugin.cpp counter-plugin.cpp) + +raja_add_executable( + NAME plugin-example-dynamic + SOURCES test-plugin-dynamic.cpp) + 
+raja_add_plugin_library(NAME timer_plugin + SHARED TRUE + SOURCES timer-plugin.cpp) diff --git a/examples/plugin/counter-plugin.cpp b/examples/plugin/counter-plugin.cpp index 146bc86d23..87b0bc13a2 100644 --- a/examples/plugin/counter-plugin.cpp +++ b/examples/plugin/counter-plugin.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// _plugin_example_start #include "RAJA/util/PluginStrategy.hpp" #include @@ -13,20 +14,37 @@ class CounterPlugin : public RAJA::util::PluginStrategy { public: - void preLaunch(RAJA::util::PluginContext p) { + void preCapture(const RAJA::util::PluginContext& p) override { if (p.platform == RAJA::Platform::host) - std::cout << " [CounterPlugin]: Launching host kernel for the " << ++host_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Capturing host kernel for the " << ++host_capture_counter << " time!" << std::endl; else - std::cout << " [CounterPlugin]: Launching device kernel for the " << ++device_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Capturing device kernel for the " << ++device_capture_counter << " time!" << std::endl; } - void postLaunch(RAJA::util::PluginContext RAJA_UNUSED_ARG(p)) { + void preLaunch(const RAJA::util::PluginContext& p) override { + if (p.platform == RAJA::Platform::host) + { + std::cout << " [CounterPlugin]: Launching host kernel for the " << ++host_launch_counter << " time!" << std::endl; + } + else + { + std::cout << " [CounterPlugin]: Launching device kernel for the " << ++device_launch_counter << " time!" << std::endl; + } } private: - int host_counter; - int device_counter; + int host_capture_counter; + int device_capture_counter; + int host_launch_counter; + int device_launch_counter; }; -// Regiser plugin with the PluginRegistry -static RAJA::util::PluginRegistry::Add P("counter-plugin", "Counter"); +// Statically loading plugin. +static RAJA::util::PluginRegistry::add P("Counter", "Counts number of kernel launches."); + +// Dynamically loading plugin. +extern "C" RAJA::util::PluginStrategy *getPlugin () +{ + return new CounterPlugin; +} +// _plugin_example_end diff --git a/examples/plugin/test-plugin-dynamic.cpp b/examples/plugin/test-plugin-dynamic.cpp new file mode 100644 index 0000000000..4e2cb202f4 --- /dev/null +++ b/examples/plugin/test-plugin-dynamic.cpp @@ -0,0 +1,22 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "RAJA/RAJA.hpp" +#include + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + RAJA::util::init_plugins("../lib/libtimer_plugin.so"); + + double *a = new double[10]; + for (int i = 0; i < 4; i++) + { + RAJA::forall(RAJA::RangeSegment(0, 10), [=](int i) { + a[i] = 0; + }); + } +} diff --git a/examples/plugin/timer-plugin.cpp b/examples/plugin/timer-plugin.cpp new file mode 100644 index 0000000000..248a514df8 --- /dev/null +++ b/examples/plugin/timer-plugin.cpp @@ -0,0 +1,48 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "RAJA/util/PluginStrategy.hpp" + +#include +#include + +class TimerPlugin : public RAJA::util::PluginStrategy +{ +public: + void preLaunch(const RAJA::util::PluginContext& RAJA_UNUSED_ARG(p)) override + { + start_time = std::chrono::steady_clock::now(); + } + + void postLaunch(const RAJA::util::PluginContext& p) override + { + end_time = std::chrono::steady_clock::now(); + double elapsedMs = std::chrono::duration(end_time - start_time).count(); + + if (p.platform == RAJA::Platform::host) + { + printf("[TimerPlugin]: Elapsed time of host kernel was %f ms\n", elapsedMs); + } + else + { + printf("[TimerPlugin]: Elapsed time of device kernel was %f ms\n", elapsedMs); + } + } + +private: + std::chrono::steady_clock::time_point start_time; + std::chrono::steady_clock::time_point end_time; +}; + +// Dynamically loading plugin. +extern "C" RAJA::util::PluginStrategy *getPlugin() +{ + return new TimerPlugin; +} + +// Statically loading plugin. +static RAJA::util::PluginRegistry::add P("Timer", "Prints elapsed time of kernel executions."); \ No newline at end of file diff --git a/examples/raja-teams.cpp b/examples/raja-teams.cpp new file mode 100644 index 0000000000..870b176bed --- /dev/null +++ b/examples/raja-teams.cpp @@ -0,0 +1,191 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" + + +/* + * RAJA Teams Example: Upper Triangular Pattern + Shared Memory + * + * Teams introduces hierarchal parallelism through the concept of + * teams and threads. Computation is executed in a pre-defined grid + * composed of threads and grouped into teams. The teams model enables + * developers to express parallelism through loops over teams, and inner loops + * over threads. Team loops are executed in parallel and + * threads within a team should be treated as sub-parallel regions. + * + * Team shared memory is allocated between team and thread loops. + * Memory allocated within thread loops are thread private. + * The example below demonstrates composing an upper triangular + * loop pattern, and using shared memory. + * + */ + +/* + * Define host/device launch policies + */ +using launch_policy = RAJA::expt::LaunchPolicy< +#if defined(RAJA_ENABLE_OPENMP) + RAJA::expt::omp_launch_t +#else + RAJA::expt::seq_launch_t +#endif +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::expt::cuda_launch_t +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::expt::hip_launch_t +#endif + >; + +/* + * Define team policies. + * Up to 3 dimension are supported: x,y,z + */ +using teams_x = RAJA::expt::LoopPolicy< +#if defined(RAJA_ENABLE_OPENMP) + RAJA::omp_parallel_for_exec +#else + RAJA::loop_exec +#endif +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_block_x_direct +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_block_x_direct +#endif + >; +/* + * Define thread policies. 
+ * Up to 3 dimension are supported: x,y,z + */ +using threads_x = RAJA::expt::LoopPolicy; + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + // Resource object for host + camp::resources::Host host_res; + + // Resource objects for CUDA or HIP +#if defined(RAJA_ENABLE_CUDA) + camp::resources::Cuda device_res; +#endif + +#if defined(RAJA_ENABLE_HIP) + camp::resources::Hip device_res; +#endif + + std::cout << "\n Running RAJA-Teams examples...\n"; + int num_of_backends = 1; +#if defined(RAJA_ENABLE_DEVICE) + num_of_backends++; +#endif + + // RAJA teams may switch between host and device policies at run time. + // The loop below will execute through the available backends. + + for (int exec_place = 0; exec_place < num_of_backends; ++exec_place) { + + RAJA::expt::ExecPlace select_cpu_or_gpu = (RAJA::expt::ExecPlace)exec_place; + + // auto select_cpu_or_gpu = RAJA::HOST; + // auto select_cpu_or_gpu = RAJA::DEVICE; + + // Allocate memory for either host or device + int N_tri = 5; + + int *Ddat; + if (select_cpu_or_gpu == RAJA::expt::HOST) + Ddat = host_res.allocate(N_tri * N_tri); + +#if defined(RAJA_ENABLE_DEVICE) + if (select_cpu_or_gpu == RAJA::expt::DEVICE) + Ddat = device_res.allocate(N_tri * N_tri); +#endif + + /* + * RAJA::expt::launch just starts a "kernel" and doesn't provide any looping. + * + * The first argument determines which policy should be executed, + * + * The second argument is the number of teams+threads needed for each of the + * policies. + * + * Third argument is the lambda. + * + * The lambda takes a "resource" object, which has the teams+threads + * and is used to perform thread synchronizations within a team. + */ + + if (select_cpu_or_gpu == RAJA::expt::HOST){ + std::cout << "\n Running Upper triangular pattern example on the host...\n"; + }else { + std::cout << "\n Running Upper triangular pattern example on the device...\n"; + } + + + RAJA::View> D(Ddat, N_tri, N_tri); + + RAJA::expt::launch(select_cpu_or_gpu, + RAJA::expt::Resources(RAJA::expt::Teams(N_tri), RAJA::expt::Threads(N_tri)), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { + + // Array shared within threads of the same team + TEAM_SHARED int s_A[1]; + + RAJA::expt::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { + if (c == r) s_A[0] = r; + D(r, c) = r * N_tri + c; + }); // loop j + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { + + printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]); + + }); // loop c + }); // loop r + }); // outer lambda + + if (select_cpu_or_gpu == RAJA::expt::HOST) { + host_res.deallocate(Ddat); + } + +#if defined(RAJA_ENABLE_DEVICE) + if (select_cpu_or_gpu == RAJA::expt::DEVICE) { + device_res.deallocate(Ddat); + } +#endif + + } // Execution places loop + + +} // Main diff --git a/examples/red-black-gauss-seidel.cpp b/examples/red-black-gauss-seidel.cpp index 9eb6401a35..6d24cee1fe 100644 --- a/examples/red-black-gauss-seidel.cpp +++ b/examples/red-black-gauss-seidel.cpp @@ -14,7 +14,7 @@ #include "RAJA/RAJA.hpp" -#include "memoryManager.hpp" +#include "camp/resource.hpp" /* * Gauss-Seidel with Red-Black Ordering Example @@ -65,7 +65,8 @@ struct grid_s { */ double solution(double x, double y); void computeErr(double *I, grid_s grid); -RAJA::TypedIndexSet gsColorPolicy(int N); +RAJA::TypedIndexSet + gsColorPolicy(int N, camp::resources::Resource& res); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ 
-97,11 +98,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) gridx.h = 1.0 / (N + 1.0); gridx.n = N + 2; - double *I = memoryManager::allocate(NN); + camp::resources::Resource resource{camp::resources::Host()}; + + double *I = resource.allocate(NN); memset(I, 0, NN * sizeof(double)); - RAJA::TypedIndexSet colorSet = gsColorPolicy(N); + RAJA::TypedIndexSet colorSet = gsColorPolicy(N, resource); memset(I, 0, NN * sizeof(double)); @@ -160,8 +163,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) computeErr(I, gridx); printf("No of iterations: %d \n \n", iteration); - - memoryManager::deallocate(I); + resource.deallocate(I); return 0; } @@ -172,9 +174,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // to generate RAJA ListSegments and populate a RAJA Static Index // Set. -RAJA::TypedIndexSet gsColorPolicy(int N) +RAJA::TypedIndexSet + gsColorPolicy(int N, camp::resources::Resource& res) { - RAJA::TypedIndexSet colorSet; int redN = ceil(N * N / 2); @@ -205,8 +207,8 @@ RAJA::TypedIndexSet gsColorPolicy(int N) } // Create Index - colorSet.push_back(RAJA::ListSegment(Blk, blkN)); - colorSet.push_back(RAJA::ListSegment(Red, redN)); + colorSet.push_back(RAJA::ListSegment(Blk, blkN, res)); + colorSet.push_back(RAJA::ListSegment(Red, redN, res)); delete[] Blk; delete[] Red; diff --git a/examples/resource-forall.cpp b/examples/resource-forall.cpp new file mode 100644 index 0000000000..a83ac7264c --- /dev/null +++ b/examples/resource-forall.cpp @@ -0,0 +1,376 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" +#include "RAJA/util/resource.hpp" + +/* + * Vector Addition Example + * + * Computes c = a + b, where a, b, c are vectors of ints. + * It illustrates similarities between a C-style for-loop and a RAJA + * forall loop. + * + * RAJA features shown: + * - `forall` loop iteration template method + * - Index range segment + * - Execution policies + * - `forall` with Resource argument + * - Cuda/Hip streams w/ Resource + * - Resources events + * + */ + + +// +// Functions for checking and printing results +// +void checkResult(int* res, int len); +void printResult(int* res, int len); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA vector addition example...\n"; + +// +// Define vector length +// + const int N = 100000; + +// +// Allocate and initialize vector data +// + RAJA::resources::Host host{}; + + int *a = host.allocate(N); + int *b = host.allocate(N); + int *c = host.allocate(N); + + int *a_ = host.allocate(N); + int *b_ = host.allocate(N); + int *c_ = host.allocate(N); + + + for (int i = 0; i < N; ++i) { + a[i] = -i; + b[i] = 2 * i; + a_[i] = -i; + b_[i] = 2 * i; + + } + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style vector addition...\n"; + + for (int i = 0; i < N; ++i) { + c[i] = a[i] + b[i]; + } + + checkResult(c, N); + + +//----------------------------------------------------------------------------// +// RAJA::seq_exec policy enforces strictly sequential execution.... 
+//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential vector addition...\n"; + + + RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); + +//----------------------------------------------------------------------------// +// RAJA::loop_exec policy enforces loop execution.... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA loop vector addition...\n"; + + RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); + +//----------------------------------------------------------------------------// +// RAJA::sind_exec policy enforces simd execution.... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA simd_exec vector addition...\n"; + + RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); + +#if defined(RAJA_ENABLE_OPENMP) +//----------------------------------------------------------------------------// +// RAJA::omp_for_parallel_exec policy execution.... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA omp_parallel vector addition...\n"; + + RAJA::forall>(host, RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); + +//----------------------------------------------------------------------------// +// RAJA::omp_for_nowait_exec policy execution.... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA omp_for_nowait vector addition...\n"; + + RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); + +//----------------------------------------------------------------------------// +// RAJA::omp_for_exec policy execution.... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA omp_for_exec vector addition...\n"; + + RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); +#endif + +#if defined(RAJA_ENABLE_TBB) +//----------------------------------------------------------------------------// +// RAJA::tbb_for_dynamic policy execution.... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA tbb_for_dynamic vector addition...\n"; + + RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); + +//----------------------------------------------------------------------------// +// RAJA::tbb_for_static policy execution.... +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA tbb_for_static<8> vector addition...\n"; + + RAJA::forall>(host, RAJA::RangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + checkResult(c, N); +#endif + + + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + +/* + GPU_BLOCK_SIZE - specifies the number of threads in a CUDA/HIP thread block +*/ +const int GPU_BLOCK_SIZE = 256; + +//----------------------------------------------------------------------------// +// RAJA::cuda/hip_exec policy execution.... 
+//----------------------------------------------------------------------------// +{ + std::cout << "\n Running RAJA GPU vector addition on 2 seperate streams...\n"; +#if defined(RAJA_ENABLE_CUDA) + RAJA::resources::Cuda res_gpu1; + RAJA::resources::Cuda res_gpu2; + using EXEC_POLICY = RAJA::cuda_exec_async; +#elif defined(RAJA_ENABLE_HIP) + RAJA::resources::Hip res_gpu1; + RAJA::resources::Hip res_gpu2; + using EXEC_POLICY = RAJA::hip_exec_async; +#endif + + int* d_a1 = res_gpu1.allocate(N); + int* d_b1 = res_gpu1.allocate(N); + int* d_c1 = res_gpu1.allocate(N); + + int* d_a2 = res_gpu2.allocate(N); + int* d_b2 = res_gpu2.allocate(N); + int* d_c2 = res_gpu2.allocate(N); + + res_gpu1.memcpy(d_a1, a, sizeof(int)* N); + res_gpu1.memcpy(d_b1, b, sizeof(int)* N); + + res_gpu2.memcpy(d_a2, a, sizeof(int)* N); + res_gpu2.memcpy(d_b2, b, sizeof(int)* N); + + + RAJA::forall(res_gpu1, RAJA::RangeSegment(0, N), + [=] RAJA_DEVICE (int i) { + d_c1[i] = d_a1[i] + d_b1[i]; + }); + + RAJA::forall(res_gpu2, RAJA::RangeSegment(0, N), + [=] RAJA_DEVICE (int i) { + d_c2[i] = d_a2[i] + d_b2[i]; + }); + + res_gpu1.memcpy(c, d_c1, sizeof(int)*N ); + + res_gpu2.memcpy(c_, d_c2, sizeof(int)*N ); + + checkResult(c, N); + checkResult(c_, N); + + res_gpu1.deallocate(d_a1); + res_gpu1.deallocate(d_b1); + res_gpu1.deallocate(d_c1); + + res_gpu2.deallocate(d_a2); + res_gpu2.deallocate(d_b2); + res_gpu2.deallocate(d_c2); +} + + +//----------------------------------------------------------------------------// +// RAJA::cuda/hip_exec policy with waiting event.... +//----------------------------------------------------------------------------// +{ + std::cout << "\n Running RAJA GPU vector with dependency between two seperate streams...\n"; +#if defined(RAJA_ENABLE_CUDA) + // _raja_res_defres_start + RAJA::resources::Cuda res_gpu1; + RAJA::resources::Cuda res_gpu2; + RAJA::resources::Host res_host; + + using EXEC_POLICY = RAJA::cuda_exec_async; + // _raja_res_defres_end +#elif defined(RAJA_ENABLE_HIP) + RAJA::resources::Hip res_gpu1; + RAJA::resources::Hip res_gpu2; + RAJA::resources::Host res_host; + + using EXEC_POLICY = RAJA::hip_exec_async; +#endif + + // _raja_res_alloc_start + int* d_array1 = res_gpu1.allocate(N); + int* d_array2 = res_gpu2.allocate(N); + int* h_array = res_host.allocate(N); + // _raja_res_alloc_end + + // _raja_res_k1_start + RAJA::forall(res_gpu1, RAJA::RangeSegment(0,N), + [=] RAJA_HOST_DEVICE (int i) { + d_array1[i] = i; + } + ); + // _raja_res_k1_end + + // _raja_res_k2_start + RAJA::resources::Event e = RAJA::forall(res_gpu2, RAJA::RangeSegment(0,N), + [=] RAJA_HOST_DEVICE (int i) { + d_array2[i] = -1; + } + ); + // _raja_res_k2_end + + // _raja_res_wait_start + res_gpu2.wait_for(&e); + // _raja_res_wait_end + + // _raja_res_k3_start + RAJA::forall(res_gpu1, RAJA::RangeSegment(0,N), + [=] RAJA_HOST_DEVICE (int i) { + d_array1[i] *= d_array2[i]; + } + ); + // _raja_res_k3_end + + // _raja_res_memcpy_start + res_gpu1.memcpy(h_array, d_array1, sizeof(int) * N); + // _raja_res_memcpy_end + + // _raja_res_k4_start + bool check = true; + RAJA::forall(res_host, RAJA::RangeSegment(0,N), + [&check, h_array] (int i) { + if(h_array[i] != -i) {check = false;} + } + ); + // _raja_res_k4_end + + std::cout << "\n result -- "; + if (check) std::cout << "PASS\n"; + else std::cout << "FAIL\n"; + + res_gpu1.deallocate(d_array1); + res_gpu2.deallocate(d_array2); + res_host.deallocate(h_array); + +} + +#endif +// +// +// Clean up. 
+// + host.deallocate(a); + host.deallocate(b); + host.deallocate(c); + + host.deallocate(a_); + host.deallocate(b_); + host.deallocate(c_); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +void checkResult(int* res, int len) +{ + bool correct = true; + for (int i = 0; i < len; i++) { + if ( res[i] != i ) { correct = false; } + } + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} + +// +// Function to print result. +// +void printResult(int* res, int len) +{ + std::cout << std::endl; + for (int i = 0; i < len; i++) { + std::cout << "result[" << i << "] = " << res[i] << std::endl; + } + std::cout << std::endl; +} + diff --git a/examples/tut_dot-product.cpp b/examples/tut_dot-product.cpp index a1caab4ac5..a3e853697e 100644 --- a/examples/tut_dot-product.cpp +++ b/examples/tut_dot-product.cpp @@ -59,8 +59,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate and initialize vector data // - int *a = memoryManager::allocate(N); - int *b = memoryManager::allocate(N); + double *a = memoryManager::allocate(N); + double *b = memoryManager::allocate(N); for (int i = 0; i < N; ++i) { a[i] = 1.0; diff --git a/examples/tut_halo-exchange.cpp b/examples/tut_halo-exchange.cpp new file mode 100644 index 0000000000..bd7b823e76 --- /dev/null +++ b/examples/tut_halo-exchange.cpp @@ -0,0 +1,1871 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" +#include "RAJA/util/Timer.hpp" + +/* + * Halo exchange Example + * + * Packs and Unpacks data from 3D variables as is done in a halo exchange. + * It illustrates how to use the workgroup set of constructs. + * + * RAJA features shown: + * - `WorkPool` template object + * - `WorkGroup` template object + * - `WorkSite` template object + * - Index range segment + * - WorkGroup policies + * + * If CUDA is enabled, CUDA unified memory is used. 
+ */ + +/* + CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block when using forall + CUDA_WORKGROUP_BLOCK_SIZE - specifies the number of threads in a CUDA thread block when using workgroup +*/ +#if defined(RAJA_ENABLE_CUDA) +const int CUDA_BLOCK_SIZE = 256; +const int CUDA_WORKGROUP_BLOCK_SIZE = 1024; +#endif + +#if defined(RAJA_ENABLE_HIP) +const int HIP_BLOCK_SIZE = 256; +const int HIP_WORKGROUP_BLOCK_SIZE = 1024; +#endif + +/* + num_neighbors - specifies the number of neighbors that each process would be + communicating with in 3D halo exchange +*/ +const int num_neighbors = 26; + +// +// Functions for checking and printing results +// +void checkResult(std::vector const& vars, std::vector const& vars_ref, + int var_size, int num_vars); +void printResult(std::vector const& vars, int var_size, int num_vars); + +// +// Functions for allocating and populating packing and unpacking lists +// +void create_pack_lists(std::vector& pack_index_lists, std::vector& pack_index_list_lengths, + const int halo_width, const int* grid_dims); +void create_unpack_lists(std::vector& unpack_index_lists, std::vector& unpack_index_list_lengths, + const int halo_width, const int* grid_dims); +void destroy_pack_lists(std::vector& pack_index_lists); +void destroy_unpack_lists(std::vector& unpack_index_lists); + + +template < typename T > +struct memory_manager_allocator +{ + using value_type = T; + + memory_manager_allocator() = default; + + template < typename U > + constexpr memory_manager_allocator(memory_manager_allocator const&) noexcept + { } + + /*[[nodiscard]]*/ + value_type* allocate(size_t num) + { + if (num > std::numeric_limits::max() / sizeof(value_type)) { + throw std::bad_alloc(); + } + + value_type *ptr = memoryManager::allocate(num); + + if (!ptr) { + throw std::bad_alloc(); + } + + return ptr; + } + + void deallocate(value_type* ptr, size_t) noexcept + { + value_type* ptrc = static_cast(ptr); + memoryManager::deallocate(ptrc); + } +}; + +template +bool operator==(memory_manager_allocator const&, memory_manager_allocator const&) +{ + return true; +} + +template +bool operator!=(memory_manager_allocator const& lhs, memory_manager_allocator const& rhs) +{ + return !(lhs == rhs); +} + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + +template < typename T > +struct pinned_allocator +{ + using value_type = T; + + pinned_allocator() = default; + + template < typename U > + constexpr pinned_allocator(pinned_allocator const&) noexcept + { } + + /*[[nodiscard]]*/ + value_type* allocate(size_t num) + { + if (num > std::numeric_limits::max() / sizeof(value_type)) { + throw std::bad_alloc(); + } + + value_type *ptr = nullptr; +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaMallocHost((void **)&ptr, num*sizeof(value_type))); +#elif defined(RAJA_ENABLE_HIP) + hipErrchk(hipHostMalloc((void **)&ptr, num*sizeof(value_type))); +#endif + + if (!ptr) { + throw std::bad_alloc(); + } + + return ptr; + } + + void deallocate(value_type* ptr, size_t) noexcept + { +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaFreeHost(ptr)); +#elif defined(RAJA_ENABLE_HIP) + hipErrchk(hipHostFree(ptr)); +#endif + } +}; + +template +bool operator==(pinned_allocator const&, pinned_allocator const&) +{ + return true; +} + +template +bool operator!=(pinned_allocator const& lhs, pinned_allocator const& rhs) +{ + return !(lhs == rhs); +} + + +#endif + +int main(int argc, char **argv) +{ + + std::cout << "\n\nRAJA halo exchange example...\n"; + + if (argc != 1 && argc != 7) { + std::cerr << "Usage: 
tut_halo-exchange " + << "[grid_x grid_y grid_z halo_width num_vars num_cycles]\n"; + std::exit(1); + } + + // _halo_exchange_input_params_start + // + // Define grid dimensions + // Define halo width + // Define number of grid variables + // Define number of cycles + // + const int grid_dims[3] = { (argc != 7) ? 100 : std::atoi(argv[1]), + (argc != 7) ? 100 : std::atoi(argv[2]), + (argc != 7) ? 100 : std::atoi(argv[3]) }; + const int halo_width = (argc != 7) ? 1 : std::atoi(argv[4]); + const int num_vars = (argc != 7) ? 3 : std::atoi(argv[5]); + const int num_cycles = (argc != 7) ? 3 : std::atoi(argv[6]); + // _halo_exchange_input_params_end + + std::cout << "grid dimensions " << grid_dims[0] + << " x " << grid_dims[1] + << " x " << grid_dims[2] << "\n" + << "halo width " << halo_width << "\n" + << "number of variables " << num_vars << "\n" + << "number of cycles " << num_cycles << "\n"; + + if ( grid_dims[0] < halo_width || + grid_dims[1] < halo_width || + grid_dims[2] < halo_width ) { + std::cerr << "Error: " + << "grid dimensions must not be smaller than the halo width\n"; + std::exit(1); + } + + const int grid_plus_halo_dims[3] = { grid_dims[0] + 2*halo_width, + grid_dims[1] + 2*halo_width, + grid_dims[2] + 2*halo_width }; + + const int var_size = grid_plus_halo_dims[0] * + grid_plus_halo_dims[1] * + grid_plus_halo_dims[2] ; + + // _halo_exchange_vars_allocate_start + // + // Allocate grid variables and reference grid variables used to check + // correctness. + // + std::vector vars (num_vars, nullptr); + std::vector vars_ref(num_vars, nullptr); + + for (int v = 0; v < num_vars; ++v) { + vars[v] = memoryManager::allocate(var_size); + vars_ref[v] = memoryManager::allocate(var_size); + } + // _halo_exchange_vars_allocate_end + + + // _halo_exchange_index_list_generate_start + // + // Generate index lists for packing and unpacking + // + std::vector pack_index_lists(num_neighbors, nullptr); + std::vector pack_index_list_lengths(num_neighbors, 0); + create_pack_lists(pack_index_lists, pack_index_list_lengths, halo_width, grid_dims); + + std::vector unpack_index_lists(num_neighbors, nullptr); + std::vector unpack_index_list_lengths(num_neighbors, 0); + create_unpack_lists(unpack_index_lists, unpack_index_list_lengths, halo_width, grid_dims); + // _halo_exchange_index_lisgeneratete_end + + + // + // Convenience type alias to reduce typing + // + using range_segment = RAJA::TypedRangeSegment; + + + auto timer = RAJA::Timer(); + + +//----------------------------------------------------------------------------// + { + std::cout << "\n Running C-style halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate(buffer_len); + + } + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + for (int i = 0; i < var_size; i++) { + var[i] = i + v; + } + } + + // _halo_exchange_sequential_cstyle_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + for (int i = 0; i < len; i++) { + buffer[i] = var[list[i]]; + } + + buffer += len; + } + + // send single message + } + // _halo_exchange_sequential_cstyle_packing_end + + 
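+      // The unpacking loop below mirrors the packing loop above: for each
+      // neighbor it walks the unpack index list and scatters the received
+      // buffer values back into the halo cells of each grid variable.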
// _halo_exchange_sequential_cstyle_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + // recv single message + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + for (int i = 0; i < len; i++) { + var[list[i]] = buffer[i]; + } + + buffer += len; + } + } + // _halo_exchange_sequential_cstyle_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate(buffers[l]); + + } + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // copy result of exchange for reference later + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + double* var_ref = vars_ref[v]; + + for (int i = 0; i < var_size; i++) { + var_ref[i] = var[i]; + } + } + } + + +//----------------------------------------------------------------------------// +// Separate packing/unpacking loops using forall +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA loop forall halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + // _halo_exchange_loop_forall_policies_start + using forall_policy = RAJA::loop_exec; + // _halo_exchange_loop_forall_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate(buffer_len); + + } + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_loop_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] (int i) { + buffer[i] = var[list[i]]; + }); + + buffer += len; + } + + // send single message + } + // _halo_exchange_loop_forall_packing_end + + // _halo_exchange_loop_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) { + + // recv single message + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] (int i) { + var[list[i]] = buffer[i]; + }); + + buffer += len; + } + } + // _halo_exchange_loop_forall_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate(buffers[l]); + + } + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + + +//----------------------------------------------------------------------------// +// RAJA::WorkGroup with allows deferred execution +// This has overhead and indirection not in the separate loop version, +// but can be 
useful for debugging. +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA loop workgroup halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + // _halo_exchange_loop_workgroup_policies_start + using forall_policy = RAJA::loop_exec; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::loop_work, + RAJA::ordered, + RAJA::ragged_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + int, + RAJA::xargs<>, + memory_manager_allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + int, + RAJA::xargs<>, + memory_manager_allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + int, + RAJA::xargs<>, + memory_manager_allocator >; + // _halo_exchange_loop_workgroup_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate(buffer_len); + + } + + workpool pool_pack (memory_manager_allocator{}); + workpool pool_unpack(memory_manager_allocator{}); + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_loop_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_pack.enqueue(range_segment(0, len), [=] (int i) { + buffer[i] = var[list[i]]; + }); + + buffer += len; + } + } + + workgroup group_pack = pool_pack.instantiate(); + + worksite site_pack = group_pack.run(); + + // send all messages + // _halo_exchange_loop_workgroup_packing_end + + // _halo_exchange_loop_workgroup_unpacking_start + // recv all messages + + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_unpack.enqueue(range_segment(0, len), [=] (int i) { + var[list[i]] = buffer[i]; + }); + + buffer += len; + } + } + + workgroup group_unpack = pool_unpack.instantiate(); + + worksite site_unpack = group_unpack.run(); + // _halo_exchange_loop_workgroup_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate(buffers[l]); + + } + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + + +//----------------------------------------------------------------------------// + + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// Separate packing/unpacking loops using forall +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA Openmp forall halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + // _halo_exchange_openmp_forall_policies_start + using forall_policy = RAJA::omp_parallel_for_exec; 
+ // _halo_exchange_openmp_forall_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate(buffer_len); + + } + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_openmp_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] (int i) { + buffer[i] = var[list[i]]; + }); + + buffer += len; + } + + // send single message + } + // _halo_exchange_openmp_forall_packing_end + + // _halo_exchange_openmp_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) { + + // recv single message + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] (int i) { + var[list[i]] = buffer[i]; + }); + + buffer += len; + } + } + // _halo_exchange_openmp_forall_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate(buffers[l]); + + } + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + + +//----------------------------------------------------------------------------// +// RAJA::WorkGroup may allow effective parallelism across loops with Openmp. 
+//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA OpenMP workgroup halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + // _halo_exchange_openmp_workgroup_policies_start + using forall_policy = RAJA::omp_parallel_for_exec; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::omp_work, + RAJA::ordered, + RAJA::ragged_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + int, + RAJA::xargs<>, + memory_manager_allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + int, + RAJA::xargs<>, + memory_manager_allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + int, + RAJA::xargs<>, + memory_manager_allocator >; + // _halo_exchange_openmp_workgroup_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate(buffer_len); + + } + + workpool pool_pack (memory_manager_allocator{}); + workpool pool_unpack(memory_manager_allocator{}); + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_openmp_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_pack.enqueue(range_segment(0, len), [=] (int i) { + buffer[i] = var[list[i]]; + }); + + buffer += len; + } + } + + workgroup group_pack = pool_pack.instantiate(); + + worksite site_pack = group_pack.run(); + + // send all messages + // _halo_exchange_openmp_workgroup_packing_end + + // _halo_exchange_openmp_workgroup_unpacking_start + // recv all messages + + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_unpack.enqueue(range_segment(0, len), [=] (int i) { + var[list[i]] = buffer[i]; + }); + + buffer += len; + } + } + + workgroup group_unpack = pool_unpack.instantiate(); + + worksite site_unpack = group_unpack.run(); + // _halo_exchange_openmp_workgroup_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate(buffers[l]); + + } + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + +#endif + + +//----------------------------------------------------------------------------// + + +#if defined(RAJA_ENABLE_CUDA) + +//----------------------------------------------------------------------------// +// Separate packing/unpacking loops using forall +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA Cuda forall halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + + std::vector cuda_vars(num_vars, nullptr); + std::vector 
cuda_pack_index_lists(num_neighbors, nullptr); + std::vector cuda_unpack_index_lists(num_neighbors, nullptr); + + for (int v = 0; v < num_vars; ++v) { + cuda_vars[v] = memoryManager::allocate_gpu(var_size); + } + + for (int l = 0; l < num_neighbors; ++l) { + int pack_len = pack_index_list_lengths[l]; + cuda_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); + cudaErrchk(cudaMemcpy( cuda_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), cudaMemcpyDefault )); + + int unpack_len = unpack_index_list_lengths[l]; + cuda_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); + cudaErrchk(cudaMemcpy( cuda_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), cudaMemcpyDefault )); + } + + std::swap(vars, cuda_vars); + std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(unpack_index_lists, cuda_unpack_index_lists); + + + // _halo_exchange_cuda_forall_policies_start + using forall_policy = RAJA::cuda_exec_async; + // _halo_exchange_cuda_forall_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate_gpu(buffer_len); + + } + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_cuda_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { + buffer[i] = var[list[i]]; + }); + + buffer += len; + } + + cudaErrchk(cudaDeviceSynchronize()); + + // send single message + } + // _halo_exchange_cuda_forall_packing_end + + // _halo_exchange_cuda_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) { + + // recv single message + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { + var[list[i]] = buffer[i]; + }); + + buffer += len; + } + } + + cudaErrchk(cudaDeviceSynchronize()); + // _halo_exchange_cuda_forall_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate_gpu(buffers[l]); + + } + + + std::swap(vars, cuda_vars); + std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(unpack_index_lists, cuda_unpack_index_lists); + + for (int v = 0; v < num_vars; ++v) { + cudaErrchk(cudaMemcpy( vars[v], cuda_vars[v], var_size * sizeof(double), cudaMemcpyDefault )); + memoryManager::deallocate_gpu(cuda_vars[v]); + } + + for (int l = 0; l < num_neighbors; ++l) { + memoryManager::deallocate_gpu(cuda_pack_index_lists[l]); + memoryManager::deallocate_gpu(cuda_unpack_index_lists[l]); + } + + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + + 
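+//
+// The workgroup version below follows a three-step pattern:
+//   1. pool.enqueue(range, lambda)  - collect loop bodies in a WorkPool
+//   2. pool.instantiate()           - build a WorkGroup from the enqueued loops
+//   3. group.run()                  - execute the group, returning a WorkSite
+// With the cuda_work policy the enqueued pack/unpack loops can be fused and
+// executed together, which can reduce launch overhead relative to issuing one
+// forall per loop as in the section above.
+//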
+//----------------------------------------------------------------------------// +// RAJA::WorkGroup with cuda_work allows deferred kernel fusion execution +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA Cuda workgroup halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + + std::vector cuda_vars(num_vars, nullptr); + std::vector cuda_pack_index_lists(num_neighbors, nullptr); + std::vector cuda_unpack_index_lists(num_neighbors, nullptr); + + for (int v = 0; v < num_vars; ++v) { + cuda_vars[v] = memoryManager::allocate_gpu(var_size); + } + + for (int l = 0; l < num_neighbors; ++l) { + int pack_len = pack_index_list_lengths[l]; + cuda_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); + cudaErrchk(cudaMemcpy( cuda_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), cudaMemcpyDefault )); + + int unpack_len = unpack_index_list_lengths[l]; + cuda_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); + cudaErrchk(cudaMemcpy( cuda_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), cudaMemcpyDefault )); + } + + std::swap(vars, cuda_vars); + std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(unpack_index_lists, cuda_unpack_index_lists); + + + // _halo_exchange_cuda_workgroup_policies_start + using forall_policy = RAJA::cuda_exec_async; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + // _halo_exchange_cuda_workgroup_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate_gpu(buffer_len); + + } + + workpool pool_pack (pinned_allocator{}); + workpool pool_unpack(pinned_allocator{}); + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_cuda_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_pack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { + buffer[i] = var[list[i]]; + }); + + buffer += len; + } + } + + workgroup group_pack = pool_pack.instantiate(); + + worksite site_pack = group_pack.run(); + + cudaErrchk(cudaDeviceSynchronize()); + + // send all messages + // _halo_exchange_cuda_workgroup_packing_end + + // _halo_exchange_cuda_workgroup_unpacking_start + // recv all messages + + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_unpack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { + var[list[i]] = buffer[i]; + }); + + buffer 
+= len; + } + } + + workgroup group_unpack = pool_unpack.instantiate(); + + worksite site_unpack = group_unpack.run(); + + cudaErrchk(cudaDeviceSynchronize()); + // _halo_exchange_cuda_workgroup_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate_gpu(buffers[l]); + + } + + + std::swap(vars, cuda_vars); + std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(unpack_index_lists, cuda_unpack_index_lists); + + for (int v = 0; v < num_vars; ++v) { + cudaErrchk(cudaMemcpy( vars[v], cuda_vars[v], var_size * sizeof(double), cudaMemcpyDefault )); + memoryManager::deallocate_gpu(cuda_vars[v]); + } + + for (int l = 0; l < num_neighbors; ++l) { + memoryManager::deallocate_gpu(cuda_pack_index_lists[l]); + memoryManager::deallocate_gpu(cuda_unpack_index_lists[l]); + } + + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + +#endif + + +//----------------------------------------------------------------------------// + + +#if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// +// Separate packing/unpacking loops using forall +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA Hip forall halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + + std::vector hip_vars(num_vars, nullptr); + std::vector hip_pack_index_lists(num_neighbors, nullptr); + std::vector hip_unpack_index_lists(num_neighbors, nullptr); + + for (int v = 0; v < num_vars; ++v) { + hip_vars[v] = memoryManager::allocate_gpu(var_size); + } + + for (int l = 0; l < num_neighbors; ++l) { + int pack_len = pack_index_list_lengths[l]; + hip_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); + hipErrchk(hipMemcpy( hip_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), hipMemcpyHostToDevice )); + + int unpack_len = unpack_index_list_lengths[l]; + hip_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); + hipErrchk(hipMemcpy( hip_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), hipMemcpyHostToDevice )); + } + + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(unpack_index_lists, hip_unpack_index_lists); + + + // _halo_exchange_hip_forall_policies_start + using forall_policy = RAJA::hip_exec_async; + // _halo_exchange_hip_forall_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate_gpu(buffer_len); + + } + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_hip_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { + buffer[i] = 
var[list[i]]; + }); + + buffer += len; + } + + hipErrchk(hipDeviceSynchronize()); + + // send single message + } + // _halo_exchange_hip_forall_packing_end + + // _halo_exchange_hip_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) { + + // recv single message + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { + var[list[i]] = buffer[i]; + }); + + buffer += len; + } + } + + hipErrchk(hipDeviceSynchronize()); + // _halo_exchange_hip_forall_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate_gpu(buffers[l]); + + } + + + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(unpack_index_lists, hip_unpack_index_lists); + + for (int v = 0; v < num_vars; ++v) { + hipErrchk(hipMemcpy( vars[v], hip_vars[v], var_size * sizeof(double), hipMemcpyDeviceToHost )); + memoryManager::deallocate_gpu(hip_vars[v]); + } + + for (int l = 0; l < num_neighbors; ++l) { + memoryManager::deallocate_gpu(hip_pack_index_lists[l]); + memoryManager::deallocate_gpu(hip_unpack_index_lists[l]); + } + + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + + +//----------------------------------------------------------------------------// +// RAJA::WorkGroup with hip_work allows deferred kernel fusion execution +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA Hip workgroup halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + + std::vector hip_vars(num_vars, nullptr); + std::vector hip_pack_index_lists(num_neighbors, nullptr); + std::vector hip_unpack_index_lists(num_neighbors, nullptr); + + for (int v = 0; v < num_vars; ++v) { + hip_vars[v] = memoryManager::allocate_gpu(var_size); + } + + for (int l = 0; l < num_neighbors; ++l) { + int pack_len = pack_index_list_lengths[l]; + hip_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); + hipErrchk(hipMemcpy( hip_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), hipMemcpyHostToDevice )); + + int unpack_len = unpack_index_list_lengths[l]; + hip_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); + hipErrchk(hipMemcpy( hip_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), hipMemcpyHostToDevice )); + } + + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(unpack_index_lists, hip_unpack_index_lists); + + + // _halo_exchange_hip_workgroup_policies_start + using forall_policy = RAJA::hip_exec_async; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, +#if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, +#else + RAJA::ordered, +#endif + RAJA::constant_stride_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + + using worksite = 
RAJA::WorkSite< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + // _halo_exchange_hip_workgroup_policies_end + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate_gpu(buffer_len); + + } + + workpool pool_pack (pinned_allocator{}); + workpool pool_unpack(pinned_allocator{}); + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { + var[i] = i + v; + }); + } + + // _halo_exchange_hip_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_pack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { + buffer[i] = var[list[i]]; + }); + + buffer += len; + } + } + + workgroup group_pack = pool_pack.instantiate(); + + worksite site_pack = group_pack.run(); + + hipErrchk(hipDeviceSynchronize()); + + // send all messages + // _halo_exchange_hip_workgroup_packing_end + + // _halo_exchange_hip_workgroup_unpacking_start + // recv all messages + + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_unpack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { + var[list[i]] = buffer[i]; + }); + + buffer += len; + } + } + + workgroup group_unpack = pool_unpack.instantiate(); + + worksite site_unpack = group_unpack.run(); + + hipErrchk(hipDeviceSynchronize()); + // _halo_exchange_hip_workgroup_unpacking_end + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate_gpu(buffers[l]); + + } + + + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(unpack_index_lists, hip_unpack_index_lists); + + for (int v = 0; v < num_vars; ++v) { + hipErrchk(hipMemcpy( vars[v], hip_vars[v], var_size * sizeof(double), hipMemcpyDeviceToHost )); + memoryManager::deallocate_gpu(hip_vars[v]); + } + + for (int l = 0; l < num_neighbors; ++l) { + memoryManager::deallocate_gpu(hip_pack_index_lists[l]); + memoryManager::deallocate_gpu(hip_unpack_index_lists[l]); + } + + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } + +#endif + + +//----------------------------------------------------------------------------// + + +// +// Clean up. +// + for (int v = 0; v < num_vars; ++v) { + memoryManager::deallocate(vars[v]); + memoryManager::deallocate(vars_ref[v]); + } + + destroy_pack_lists(pack_index_lists); + destroy_unpack_lists(unpack_index_lists); + + + std::cout << "\n DONE!...\n"; + + return 0; +} + + +// +// Function to compare result to reference and report P/F. 
+// +void checkResult(std::vector const& vars, std::vector const& vars_ref, + int var_size, int num_vars) +{ + bool correct = true; + for (int v = 0; v < num_vars; ++v) { + double* var = vars[v]; + double* var_ref = vars_ref[v]; + for (int i = 0; i < var_size; i++) { + if ( var[i] != var_ref[i] ) { correct = false; } + } + } + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} + +// +// Function to print result. +// +void printResult(std::vector const& vars, int var_size, int num_vars) +{ + std::cout << std::endl; + for (int v = 0; v < num_vars; ++v) { + double* var = vars[v]; + for (int i = 0; i < var_size; i++) { + std::cout << "result[" << i << "] = " << var[i] << std::endl; + } + } + std::cout << std::endl; +} + + +struct Extent +{ + int i_min; + int i_max; + int j_min; + int j_max; + int k_min; + int k_max; +}; + +// +// Function to generate index lists for packing. +// +void create_pack_lists(std::vector& pack_index_lists, + std::vector& pack_index_list_lengths, + const int halo_width, const int* grid_dims) +{ + std::vector pack_index_list_extents(num_neighbors); + + // faces + pack_index_list_extents[0] = Extent{halo_width , halo_width + halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[1] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + + // edges + pack_index_list_extents[6] = Extent{halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[7] = Extent{halo_width , halo_width + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[8] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[9] = Extent{grid_dims[0], grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[10] = Extent{halo_width , halo_width + halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[11] = Extent{halo_width , halo_width + halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[12] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[13] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , 
halo_width + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + + // corners + pack_index_list_extents[18] = Extent{halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[19] = Extent{halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[20] = Extent{halo_width , halo_width + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[21] = Extent{halo_width , halo_width + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[22] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[23] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[24] = Extent{grid_dims[0], grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[25] = Extent{grid_dims[0], grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + + const int grid_i_stride = 1; + const int grid_j_stride = grid_dims[0] + 2*halo_width; + const int grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); + + for (int l = 0; l < num_neighbors; ++l) { + + Extent extent = pack_index_list_extents[l]; + + pack_index_list_lengths[l] = (extent.i_max - extent.i_min) * + (extent.j_max - extent.j_min) * + (extent.k_max - extent.k_min) ; + + pack_index_lists[l] = memoryManager::allocate(pack_index_list_lengths[l]); + + int* pack_list = pack_index_lists[l]; + + int list_idx = 0; + for (int kk = extent.k_min; kk < extent.k_max; ++kk) { + for (int jj = extent.j_min; jj < extent.j_max; ++jj) { + for (int ii = extent.i_min; ii < extent.i_max; ++ii) { + + int pack_idx = ii * grid_i_stride + + jj * grid_j_stride + + kk * grid_k_stride ; + + pack_list[list_idx] = pack_idx; + + list_idx += 1; + } + } + } + } +} + +// +// Function to destroy packing index lists. +// +void destroy_pack_lists(std::vector& pack_index_lists) +{ + for (int l = 0; l < num_neighbors; ++l) { + memoryManager::deallocate(pack_index_lists[l]); + } +} + + +// +// Function to generate index lists for unpacking. 
+// +void create_unpack_lists(std::vector& unpack_index_lists, std::vector& unpack_index_list_lengths, + const int halo_width, const int* grid_dims) +{ + std::vector unpack_index_list_extents(num_neighbors); + + // faces + unpack_index_list_extents[0] = Extent{0 , halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[1] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, + 0 , halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + 0 , halo_width}; + unpack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + + // edges + unpack_index_list_extents[6] = Extent{0 , halo_width, + 0 , halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[7] = Extent{0 , halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[8] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + 0 , halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[9] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[10] = Extent{0 , halo_width, + halo_width , grid_dims[1] + halo_width, + 0 , halo_width}; + unpack_index_list_extents[11] = Extent{0 , halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[12] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + halo_width , grid_dims[1] + halo_width, + 0 , halo_width}; + unpack_index_list_extents[13] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, + 0 , halo_width, + 0 , halo_width}; + unpack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, + 0 , halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + 0 , halo_width}; + unpack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + + // corners + unpack_index_list_extents[18] = Extent{0 , halo_width, + 0 , halo_width, + 0 , halo_width}; + unpack_index_list_extents[19] = Extent{0 , halo_width, + 0 , halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[20] = Extent{0 , halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + 0 , halo_width}; + unpack_index_list_extents[21] = Extent{0 , halo_width, + grid_dims[1] + halo_width, 
grid_dims[1] + 2*halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[22] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + 0 , halo_width, + 0 , halo_width}; + unpack_index_list_extents[23] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + 0 , halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[24] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + 0 , halo_width}; + unpack_index_list_extents[25] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + + const int grid_i_stride = 1; + const int grid_j_stride = grid_dims[0] + 2*halo_width; + const int grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); + + for (int l = 0; l < num_neighbors; ++l) { + + Extent extent = unpack_index_list_extents[l]; + + unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) * + (extent.j_max - extent.j_min) * + (extent.k_max - extent.k_min) ; + + unpack_index_lists[l] = memoryManager::allocate(unpack_index_list_lengths[l]); + + int* unpack_list = unpack_index_lists[l]; + + int list_idx = 0; + for (int kk = extent.k_min; kk < extent.k_max; ++kk) { + for (int jj = extent.j_min; jj < extent.j_max; ++jj) { + for (int ii = extent.i_min; ii < extent.i_max; ++ii) { + + int unpack_idx = ii * grid_i_stride + + jj * grid_j_stride + + kk * grid_k_stride ; + + unpack_list[list_idx] = unpack_idx; + + list_idx += 1; + } + } + } + } +} + +// +// Function to destroy unpacking index lists. +// +void destroy_unpack_lists(std::vector& unpack_index_lists) +{ + for (int l = 0; l < num_neighbors; ++l) { + memoryManager::deallocate(unpack_index_lists[l]); + } +} diff --git a/examples/tut_indexset-segments.cpp b/examples/tut_indexset-segments.cpp index 17a18c85fa..5750a8066e 100644 --- a/examples/tut_indexset-segments.cpp +++ b/examples/tut_indexset-segments.cpp @@ -15,6 +15,8 @@ #include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" + /* * Index sets and Segments Example * @@ -127,6 +129,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //printResult(a, N); //----------------------------------------------------------------------------// +// Resource object used to construct list segment objects with indices +// living in host (CPU) memory. 
+ + camp::resources::Resource host_res{camp::resources::Host()}; + + // // RAJA list segment version #1 // @@ -144,7 +152,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) idx.push_back(i); } - ListSegType idx_list( &idx[0], idx.size() ); + ListSegType idx_list( &idx[0], idx.size(), host_res ); RAJA::forall(idx_list, [=] (IdxType i) { a[i] += b[i] * c; @@ -168,7 +176,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_list_segment_daxpy_reverse_start std::reverse( idx.begin(), idx.end() ); - ListSegType idx_reverse_list( &idx[0], idx.size() ); + ListSegType idx_reverse_list( &idx[0], idx.size(), host_res ); RAJA::forall(idx_reverse_list, [=] (IdxType i) { a[i] += b[i] * c; @@ -267,7 +275,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) idx1.push_back(i); } - ListSegType idx1_list( &idx1[0], idx1.size() ); + ListSegType idx1_list( &idx1[0], idx1.size(), host_res ); RAJA::TypedIndexSet is3; is3.push_back( RAJA::RangeSegment(0, N/3) ); @@ -333,6 +341,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) + +// +// We create a new resource object and index set so that list segment +// indices live in CUDA device memory. +// + camp::resources::Resource cuda_res{camp::resources::Cuda()}; + + ListSegType idx1_list_cuda( &idx1[0], idx1.size(), cuda_res ); + + RAJA::TypedIndexSet is3_cuda; + is3_cuda.push_back( RAJA::RangeSegment(0, N/3) ); + is3_cuda.push_back( idx1_list_cuda ); + is3_cuda.push_back( RAJA::RangeSegment(2*N/3, N) ); + + std::cout << "\n Running RAJA index set (2 RangeSegments, 1 ListSegment) daxpy\n" << " (sequential iteration over segments, CUDA parallel segment execution)...\n"; @@ -344,7 +367,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memcpy( a, a0, N * sizeof(double) ); - RAJA::forall(is3, [=] RAJA_DEVICE (IdxType i) { + RAJA::forall(is3_cuda, [=] RAJA_DEVICE (IdxType i) { a[i] += b[i] * c; }); @@ -355,6 +378,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) + +// +// We create a new resource object and index set so that list segment +// indices live in Hip device memory. 
+// + camp::resources::Resource hip_res{camp::resources::Hip()}; + + ListSegType idx1_list_hip( &idx1[0], idx1.size(), hip_res ); + + RAJA::TypedIndexSet is3_hip; + is3_hip.push_back( RAJA::RangeSegment(0, N/3) ); + is3_hip.push_back( idx1_list_hip ); + is3_hip.push_back( RAJA::RangeSegment(2*N/3, N) ); + std::cout << "\n Running RAJA index set (2 RangeSegments, 1 ListSegment) daxpy\n" << " (sequential iteration over segments, HIP parallel segment execution)...\n"; @@ -368,7 +405,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_a, a0, N * sizeof(double), hipMemcpyHostToDevice )); hipErrchk(hipMemcpy( d_b, b, N * sizeof(double), hipMemcpyHostToDevice )); - RAJA::forall(is3, [=] RAJA_DEVICE (IdxType i) { + RAJA::forall(is3_hip, [=] RAJA_DEVICE (IdxType i) { d_a[i] += d_b[i] * c; }); diff --git a/examples/tut_matrix-multiply.cpp b/examples/tut_matrix-multiply.cpp index ce9ceb750c..9154bd22df 100644 --- a/examples/tut_matrix-multiply.cpp +++ b/examples/tut_matrix-multiply.cpp @@ -472,8 +472,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXEC_POL5 = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_loop, RAJA::statement::For<0, RAJA::cuda_thread_x_loop, RAJA::statement::Lambda<0> @@ -575,8 +575,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXEC_POL5 = RAJA::KernelPolicy< RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_y_loop, RAJA::statement::For<0, RAJA::hip_thread_x_loop, RAJA::statement::Lambda<0> @@ -634,11 +634,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::loop_exec, RAJA::statement::For<0, RAJA::loop_exec, - RAJA::statement::Lambda<0>, // dot = 0.0 + RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 RAJA::statement::For<2, RAJA::loop_exec, RAJA::statement::Lambda<1> // inner loop: dot += ... 
>, - RAJA::statement::Lambda<2> // set C(row, col) = dot + RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C(row, col) = dot > > >; @@ -649,7 +649,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::tuple{0.0}, // thread local variable for 'dot' // lambda 0 - [=] (int /* col */, int /* row */, int /* k */, double& dot) { + [=] (double& dot) { dot = 0.0; }, @@ -659,7 +659,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // lambda 2 - [=] (int col, int row, int /* k */, double& dot) { + [=] (int col, int row, double& dot) { Cview(row, col) = dot; } @@ -683,8 +683,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _matmult_3lambdakernel_args_seq_start // Alias for convenience - using RAJA::statement::Segs; - using RAJA::statement::Params; + using RAJA::Segs; + using RAJA::Params; using EXEC_POL6b = RAJA::KernelPolicy< @@ -738,11 +738,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::KernelPolicy< RAJA::statement::Collapse, // row, col - RAJA::statement::Lambda<0>, // dot = 0.0 + RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 RAJA::statement::For<2, RAJA::loop_exec, RAJA::statement::Lambda<1> // inner loop: dot += ... >, - RAJA::statement::Lambda<2> // set C(row, col) = dot + RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C(row, col) = dot > >; // _matmult_3lambdakernel_ompcollapse_end @@ -753,7 +753,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::tuple{0.0}, // thread local variable for 'dot' // lambda 0 - [=] (int /* col */, int /* row */, int /* k */, double& dot) { + [=] (double& dot) { dot = 0.0; }, @@ -763,7 +763,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // lambda 2 - [=] (int col, int row, int /* k */, double& dot) { + [=] (int col, int row, double& dot) { Cview(row, col) = dot; } @@ -787,11 +787,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::CudaKernel< RAJA::statement::For<1, RAJA::cuda_block_x_loop, // row RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // col - RAJA::statement::Lambda<0>, // dot = 0.0 + RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 RAJA::statement::For<2, RAJA::seq_exec, RAJA::statement::Lambda<1> // dot += ... >, - RAJA::statement::Lambda<2> // set C = ... + RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C = ... 
> > > @@ -804,7 +804,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::tuple{0.0}, // thread local variable for 'dot' // lambda 0 - [=] RAJA_DEVICE (int /* col */, int /* row */, int /* k */, double& dot) { + [=] RAJA_DEVICE (double& dot) { dot = 0.0; }, @@ -814,7 +814,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // lambda 2 - [=] RAJA_DEVICE (int col, int row, int /* k */, double& dot) { + [=] RAJA_DEVICE (int col, int row, double& dot) { Cview(row, col) = dot; } @@ -833,15 +833,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXEC_POL9a = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // row RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // col - RAJA::statement::Lambda<0>, // dot = 0.0 + RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 RAJA::statement::For<2, RAJA::seq_exec, RAJA::statement::Lambda<1> // dot += ... >, - RAJA::statement::Lambda<2> // set C = ... + RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C = ... > > > @@ -856,7 +856,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::tuple{0.0}, // thread local variable for 'dot' // lambda 0 - [=] RAJA_DEVICE (int /* col */, int /* row */, int /* k */, double& dot) { + [=] RAJA_DEVICE (double& dot) { dot = 0.0; }, @@ -866,7 +866,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // lambda 2 - [=] RAJA_DEVICE (int col, int row, int /* k */, double& dot) { + [=] RAJA_DEVICE (int col, int row, double& dot) { Cview(row, col) = dot; } @@ -884,8 +884,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXEC_POL9b = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // row RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // col RAJA::statement::Lambda<0, Params<0>>, // dot = 0.0 @@ -967,11 +967,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::HipKernel< RAJA::statement::For<1, RAJA::hip_block_x_loop, // row RAJA::statement::For<0, RAJA::hip_thread_x_loop, // col - RAJA::statement::Lambda<0>, // dot = 0.0 + RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 RAJA::statement::For<2, RAJA::seq_exec, RAJA::statement::Lambda<1> // dot += ... >, - RAJA::statement::Lambda<2> // set C = ... + RAJA::statement::Lambda<2, + RAJA::Segs<0,1>, + RAJA::Params<0>> // set C = ... 
> > > @@ -983,7 +985,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::tuple{0.0}, // thread local variable for 'dot' // lambda 0 - [=] RAJA_DEVICE (int /* col */, int /* row */, int /* k */, double& dot) { + [=] RAJA_DEVICE (double& dot) { dot = 0.0; }, @@ -993,7 +995,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // lambda 2 - [=] RAJA_DEVICE (int col, int row, int /* k */, double& dot) { + [=] RAJA_DEVICE (int col, int row, double& dot) { d_Cview(row, col) = dot; } @@ -1003,61 +1005,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(Cview, N); //printResult(Cview, N); -//----------------------------------------------------------------------------// - - std::cout << "\n Running HIP mat-mult with multiple lambdas (RAJA-POL9a)...\n"; - - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); - - using EXEC_POL9a = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::hip_block_x_loop, - RAJA::statement::For<1, RAJA::hip_thread_y_loop, // row - RAJA::statement::For<0, RAJA::hip_thread_x_loop, // col - RAJA::statement::Lambda<0>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1> // dot += ... - >, - RAJA::statement::Lambda<2> // set C = ... - > - > - > - > - > - >; - - RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), - - RAJA::tuple{0.0}, // thread local variable for 'dot' - - // lambda 0 - [=] RAJA_DEVICE (int /* col */, int /* row */, int /* k */, double& dot) { - dot = 0.0; - }, - - // lambda 1 - [=] RAJA_DEVICE (int col, int row, int k, double& dot) { - dot += d_Aview(row, k) * d_Bview(k, col); - }, - - // lambda 2 - [=] RAJA_DEVICE (int col, int row, int /* k */, double& dot) { - d_Cview(row, col) = dot; - } - - ); - - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); - checkResult(Cview, N); -//printResult(Cview, N); //----------------------------------------------------------------------------// - std::cout << "\n Running HIP mat-mult with multiple lambdas - lambda args in statements (RAJA-POL9b)...\n"; + std::cout << "\n Running HIP mat-mult with multiple lambdas - lambda args in statements (RAJA-POL9)...\n"; std::memset(C, 0, N*N * sizeof(double)); hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); @@ -1065,8 +1016,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXEC_POL9b = RAJA::KernelPolicy< RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_y_loop, // row RAJA::statement::For<0, RAJA::hip_thread_x_loop, // col RAJA::statement::Lambda<0, Params<0>>, // dot = 0.0 diff --git a/examples/tut_matrix-transpose-local-array.cpp b/examples/tut_matrix-transpose-local-array.cpp index 610cc2c6a6..fbb1f508d6 100644 --- a/examples/tut_matrix-transpose-local-array.cpp +++ b/examples/tut_matrix-transpose-local-array.cpp @@ -202,8 +202,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _mattranspose_localarray_raja_start 
using SEQ_EXEC_POL_I = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::loop_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, RAJA::statement::InitLocalMem, @@ -248,15 +248,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); //Alias for convenience - using RAJA::statement::Segs; - using RAJA::statement::Offsets; - using RAJA::statement::Params; + using RAJA::Segs; + using RAJA::Offsets; + using RAJA::Params; // _mattranspose_localarray_raja_lambdaargs_start using SEQ_EXEC_POL_II = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::loop_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, RAJA::statement::InitLocalMem, @@ -309,8 +309,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::omp_parallel_for_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. The entries in the RAJA::ParamList @@ -376,8 +376,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::loop_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. The entries in the RAJA::ParamList @@ -445,8 +445,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. 
The entries in the RAJA::ParamList @@ -538,8 +538,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. The entries in the RAJA::ParamList diff --git a/examples/tut_sort.cpp b/examples/tut_sort.cpp new file mode 100644 index 0000000000..02171b56ec --- /dev/null +++ b/examples/tut_sort.cpp @@ -0,0 +1,634 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Sort Example + * + * Example shows how to perform RAJA unstable and stable sort operations + * for integer arrays, including pairs variant, using different comparators. + * Other array data types, comparators, etc. are similar + * + * RAJA features shown: + * - `RAJA::sort` and `RAJA::sort_pairs` methods + * - `RAJA::stable_sort` and `RAJA::stable_sort_pairs` methods + * - RAJA operators + * - Execution policies + * + * If CUDA is enabled, CUDA unified memory is used. 
+ */ + +/* + CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block +*/ +#if defined(RAJA_ENABLE_CUDA) +const int CUDA_BLOCK_SIZE = 16; +#endif + +#if defined(RAJA_ENABLE_HIP) +const int HIP_BLOCK_SIZE = 16; +#endif + +// +// Functions for checking results and printing vectors +// +template +void checkUnstableSortResult(const T* in, const T* out, int N); +template +void checkUnstableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N); +// +template +void checkStableSortResult(const T* in, const T* out, int N); +template +void checkStableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N); +// +template +void printArray(const T* k, int N); +template +void printArray(const T* k, const U* v, int N); + + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA sort example...\n"; + + // _sort_array_init_start +// +// Define array length +// + const int N = 20; + +// +// Allocate and initialize vector data +// + int* in = memoryManager::allocate(N); + int* out = memoryManager::allocate(N); + + unsigned* in_vals = memoryManager::allocate(N); + unsigned* out_vals = memoryManager::allocate(N); + + std::iota(in , in + N/2, 0); + std::iota(in + N/2, in + N , 0); + std::shuffle(in , in + N/2, std::mt19937{12345u}); + std::shuffle(in + N/2, in + N , std::mt19937{67890u}); + + std::fill(in_vals , in_vals + N/2, 0); + std::fill(in_vals + N/2, in_vals + N , 1); + + // _sort_array_init_end + + std::cout << "\n in keys...\n"; + printArray(in, N); + std::cout << "\n in (key, value) pairs...\n"; + printArray(in, in_vals, N); + std::cout << "\n"; + + +//----------------------------------------------------------------------------// +// Perform various sequential sorts to illustrate unstable/stable, +// pairs, default sorts with different comparators +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential sort (default)...\n"; + + std::copy_n(in, N, out); + + // _sort_seq_start + RAJA::sort(out, out + N); + // _sort_seq_end + + checkUnstableSortResult>(in, out, N); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + // _sort_seq_less_start + RAJA::sort(out, out + N, + RAJA::operators::less{}); + // _sort_seq_less_end + + checkUnstableSortResult>(in, out, N); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential stable_sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + // _sort_stable_seq_less_start + RAJA::stable_sort(out, out + N, + RAJA::operators::less{}); + // _sort_stable_seq_less_end + + checkStableSortResult>(in, out, N); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential stable_sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + // _sort_stable_seq_greater_start + RAJA::stable_sort(out, out + N, + RAJA::operators::greater{}); + // _sort_stable_seq_greater_end + + checkStableSortResult>(in, out, N); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential sort_pairs (non-decreasing)...\n"; + 
+ std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + // _sort_pairs_seq_less_start + RAJA::sort_pairs(out, out + N, out_vals, + RAJA::operators::less{}); + // _sort_pairs_seq_less_end + + checkUnstableSortResult>(in, out, in_vals, out_vals, N); + printArray(out, out_vals, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential stable_sort_pairs (non-increasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + // _sort_stable_pairs_seq_greater_start + RAJA::stable_sort_pairs(out, out + N, out_vals, + RAJA::operators::greater{}); + // _sort_stable_pairs_seq_greater_end + + checkStableSortResult>(in, out, in_vals, out_vals, N); + printArray(out, out_vals, N); + std::cout << "\n"; + + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// Perform a couple of OpenMP sorts... +//----------------------------------------------------------------------------// + + std::cout << "\n Running OpenMP sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + // _sort_omp_less_start + RAJA::sort(out, out + N, + RAJA::operators::less{}); + // _sort_omp_less_end + + checkUnstableSortResult>(in, out, N); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running OpenMP stable_sort_pairs (non-increasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + // _sort_stable_pairs_omp_greater_start + RAJA::stable_sort_pairs(out, out + N, out_vals, + RAJA::operators::greater{}); + // _sort_stable_pairs_omp_greater_end + + checkStableSortResult>(in, out, in_vals, out_vals, N); + printArray(out, out_vals, N); + std::cout << "\n"; + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + +//----------------------------------------------------------------------------// +// Perform a couple of CUDA sorts... +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA sort_pairs (non-increasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + // _sort_pairs_cuda_greater_start + RAJA::sort_pairs>(out, out + N, out_vals, + RAJA::operators::greater{}); + // _sort_pairs_cuda_greater_end + + checkUnstableSortResult>(in, out, in_vals, out_vals, N); + printArray(out, out_vals, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA stable_sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + // _sort_stable_cuda_less_start + RAJA::stable_sort>(out, out + N, + RAJA::operators::less{}); + // _sort_stable_cuda_less_end + + checkStableSortResult>(in, out, N); + printArray(out, N); + std::cout << "\n"; + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// +// Perform a couple of HIP sorts... 
+//----------------------------------------------------------------------------// + + std::cout << "\n Running HIP sort_pairs (non-decreasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + int* d_out = memoryManager::allocate_gpu(N); + int* d_out_vals = memoryManager::allocate_gpu(N); + + hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice )); + + RAJA::sort_pairs>(d_out, d_out + N, d_out_vals, + RAJA::operators::less{}); + + hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy( out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost )); + + checkUnstableSortResult>(in, out, in_vals, out_vals, N); + printArray(out, out_vals, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running HIP stable_sort (non-increasing)...\n"; + + std::copy_n(in, N, out); + + hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + + RAJA::stable_sort>(d_out, d_out + N, + RAJA::operators::greater{}); + + hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + + checkStableSortResult>(in, out, N); + printArray(out, N); + std::cout << "\n"; + + memoryManager::deallocate_gpu(d_out); + memoryManager::deallocate_gpu(d_out_vals); + +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. +// + memoryManager::deallocate(in); + memoryManager::deallocate(out); + + memoryManager::deallocate(in_vals); + memoryManager::deallocate(out_vals); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +template +bool equivalent(T const& a, T const& b, Comparator comp) +{ + return !comp(a, b) && !comp(b, a); +} + +// +// Function to check unstable sort result +// +template +void checkUnstableSortResult(const T* in, const T* out, int N) +{ + Comparator comp; + bool correct = true; + + // make map of keys to keys + using val_map = std::unordered_multiset; + std::unordered_map keys; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys.find(in[i]); + if (key_iter == keys.end()) { + auto ret = keys.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace(in[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i-1] << ", " << out[i] + << " out of order" + << " (at index " << i-1 << ")\n"; + } + // test there is an item with this + auto key_iter = keys.find(out[i]); + if (key_iter == keys.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " unknown or duplicate key" + << " (at index " << i << ")\n"; + } + auto val_iter = key_iter->second.find(out[i]); + if (val_iter == key_iter->second.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " unknown or duplicate val" + << " (at index " << i << ")\n"; + } + key_iter->second.erase(val_iter); + if (key_iter->second.size() == 0) { + keys.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} +/// +template +void checkUnstableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N) +{ + Comparator 
comp; + bool correct = true; + + // make map of keys to vals + using val_map = std::unordered_multiset; + std::unordered_map keys_to_vals; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys_to_vals.find(in[i]); + if (key_iter == keys_to_vals.end()) { + auto ret = keys_to_vals.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace(in_vals[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i-1] << "," << out_vals[i-1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" + << " out of order" + << " (at index " << i-1 << ")\n"; + } + // test there is a pair with this key and val + auto key_iter = keys_to_vals.find(out[i]); + if (key_iter == keys_to_vals.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " unknown or duplicate key" + << " (at index " << i << ")\n"; + } + auto val_iter = key_iter->second.find(out_vals[i]); + if (val_iter == key_iter->second.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " unknown or duplicate val" + << " (at index " << i << ")\n"; + } + key_iter->second.erase(val_iter); + if (key_iter->second.size() == 0) { + keys_to_vals.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} + +// +// Function to check stable sort result +// +template +void checkStableSortResult(const T* in, const T* out, int N) +{ + Comparator comp; + bool correct = true; + + // make map of keys to keys + using val_map = std::list; + std::unordered_map keys; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys.find(in[i]); + if (key_iter == keys.end()) { + auto ret = keys.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace_back(in[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i-1] << ", " << out[i] + << " out of order " + << " (at index " << i-1 << ")\n"; + } + // test there is an item with this + auto key_iter = keys.find(out[i]); + if (key_iter == keys.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " unknown or duplicate key " + << " (at index " << i << ")\n"; + } + if (key_iter->second.front() != out[i]) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " out of stable order or unknown val " + << " (at index " << i << ")\n"; + } + key_iter->second.pop_front(); + if (key_iter->second.size() == 0) { + keys.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} +/// +template +void checkStableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N) +{ + Comparator comp; + bool correct = true; + + // make map of keys to vals + using val_map = std::list; + std::unordered_map keys_to_vals; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys_to_vals.find(in[i]); + if (key_iter == keys_to_vals.end()) { + auto ret = 
keys_to_vals.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace_back(in_vals[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i-1] << "," << out_vals[i-1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" + << " out of order " + << " (at index " << i-1 << ")\n"; + } + // test there is a pair with this key and val + auto key_iter = keys_to_vals.find(out[i]); + if (key_iter == keys_to_vals.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " unknown or duplicate key " + << " (at index " << i << ")\n"; + } + if (key_iter->second.front() != out_vals[i]) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " out of stable order or unknown val " + << " (at index " << i << ")\n"; + } + key_iter->second.pop_front(); + if (key_iter->second.size() == 0) { + keys_to_vals.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} + + +// +// Function to print vector. +// +template +void printArray(const T* k, int N) +{ + std::cout << std::endl; + for (int i = 0; i < N; ++i) { std::cout << " " << k[i]; } + std::cout << std::endl; +} +/// +template +void printArray(const T* k, const U* v, int N) +{ + std::cout << std::endl; + for (int i = 0; i < N; ++i) { std::cout << " (" << k[i] << "," << v[i] << ")"; } + std::cout << std::endl; +} + diff --git a/examples/tut_tiled-matrix-transpose.cpp b/examples/tut_tiled-matrix-transpose.cpp index 0ad0ebb85d..44c8fbc5b7 100644 --- a/examples/tut_tiled-matrix-transpose.cpp +++ b/examples/tut_tiled-matrix-transpose.cpp @@ -161,8 +161,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_tiled_mattranspose_start using KERNEL_EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> @@ -193,8 +193,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using KERNEL_EXEC_POL_OMP = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::For<1, RAJA::omp_parallel_for_exec, RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::Lambda<0> @@ -227,8 +227,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Collapse, RAJA::statement::Lambda<0> @@ -260,8 +260,8 @@ int main(int RAJA_UNUSED_ARG(argc), char 
**RAJA_UNUSED_ARG(argv[])) using KERNEL_EXEC_POL_CUDA = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_x_direct, RAJA::statement::For<0, RAJA::cuda_thread_y_direct, RAJA::statement::Lambda<0> @@ -302,8 +302,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using KERNEL_EXEC_POL_HIP = RAJA::KernelPolicy< RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_x_direct, RAJA::statement::For<0, RAJA::hip_thread_y_direct, RAJA::statement::Lambda<0> diff --git a/examples/tut_vertexsum-coloring.cpp b/examples/tut_vertexsum-coloring.cpp index 3f4de68771..1612b2af0b 100644 --- a/examples/tut_vertexsum-coloring.cpp +++ b/examples/tut_vertexsum-coloring.cpp @@ -15,6 +15,8 @@ #include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" + /* * Mesh Vertex Sum with Index Coloring Example * @@ -232,10 +234,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::TypedIndexSet colorset; - colorset.push_back( SegmentType(&idx0[0], idx0.size()) ); - colorset.push_back( SegmentType(&idx1[0], idx1.size()) ); - colorset.push_back( SegmentType(&idx2[0], idx2.size()) ); - colorset.push_back( SegmentType(&idx3[0], idx3.size()) ); + camp::resources::Resource host_res{camp::resources::Host()}; + + colorset.push_back( SegmentType(&idx0[0], idx0.size(), host_res) ); + colorset.push_back( SegmentType(&idx1[0], idx1.size(), host_res) ); + colorset.push_back( SegmentType(&idx2[0], idx2.size(), host_res) ); + colorset.push_back( SegmentType(&idx3[0], idx3.size(), host_res) ); // _colorindexset_vertexsum_end //----------------------------------------------------------------------------// @@ -307,13 +311,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // std::cout << "\n Running RAJA CUDA index set version...\n"; +// +// We create a RAJA TypedIndexSet with four ListSegments as before, +// but now we use a CUDA resource so the segment indices live in +// device memory. 
+// + RAJA::TypedIndexSet colorset_cuda; + + camp::resources::Resource cuda_res{camp::resources::Cuda()}; + + colorset_cuda.push_back( SegmentType(&idx0[0], idx0.size(), cuda_res) ); + colorset_cuda.push_back( SegmentType(&idx1[0], idx1.size(), cuda_res) ); + colorset_cuda.push_back( SegmentType(&idx2[0], idx2.size(), cuda_res) ); + colorset_cuda.push_back( SegmentType(&idx3[0], idx3.size(), cuda_res) ); + std::memset(vertexvol, 0, N_vert*N_vert * sizeof(double)); // _raja_cuda_colorindexset_vertexsum_start using EXEC_POL4 = RAJA::ExecPolicy>; - RAJA::forall(colorset, [=] RAJA_DEVICE (int ie) { + RAJA::forall(colorset_cuda, [=] RAJA_DEVICE (int ie) { int* iv = &(elem2vert_map[4*ie]); vertexvol[ iv[0] ] += elemvol[ie] / 4.0 ; vertexvol[ iv[1] ] += elemvol[ie] / 4.0 ; @@ -347,10 +365,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(vertexvol, 0, N_vert*N_vert * sizeof(double)); hipMemcpy(d_vertexvol, vertexvol, N_vert*N_vert*sizeof(double), hipMemcpyHostToDevice); +// +// We create a RAJA TypedIndexSet with four ListSegments as before, +// but now we use a Hip resource so the segment indices live in +// device memory. +// + RAJA::TypedIndexSet colorset_hip; + + camp::resources::Resource hip_res{camp::resources::Hip()}; + + colorset_hip.push_back( SegmentType(&idx0[0], idx0.size(), hip_res) ); + colorset_hip.push_back( SegmentType(&idx1[0], idx1.size(), hip_res) ); + colorset_hip.push_back( SegmentType(&idx2[0], idx2.size(), hip_res) ); + colorset_hip.push_back( SegmentType(&idx3[0], idx3.size(), hip_res) ); + using EXEC_POL4 = RAJA::ExecPolicy>; - RAJA::forall(colorset, [=] RAJA_DEVICE (int ie) { + RAJA::forall(colorset_hip, [=] RAJA_DEVICE (int ie) { int* iv = &(d_elem2vert_map[4*ie]); d_vertexvol[ iv[0] ] += d_elemvol[ie] / 4.0 ; d_vertexvol[ iv[1] ] += d_elemvol[ie] / 4.0 ; diff --git a/examples/wave-eqn.cpp b/examples/wave-eqn.cpp index dcc651d048..f1345335c5 100644 --- a/examples/wave-eqn.cpp +++ b/examples/wave-eqn.cpp @@ -128,15 +128,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; // OpenMP policy - // using fdPolicy = RAJA::KernelPolicy< - // RAJA::statement::For<0, RAJA::omp_parallel_for_exec >, - // RAJA::statement::For<1, RAJA::seq_exec > >; + //using fdPolicy = RAJA::KernelPolicy< + //RAJA::statement::For<1, RAJA::omp_parallel_for_exec, + // RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::Lambda<0> > > >; // CUDA policy - // using fdPolicy = RAJA::KernelPolicy< - // RAJA::statement::CudaCollapse< - // RAJA::statement::For<0, RAJA::cuda_threadblock_x_exec<16> >, - // RAJA::statement::For<1, RAJA::cuda_threadblock_y_exec<16> > > >; + //using fdPolicy = + //RAJA::KernelPolicy< + // RAJA::statement::CudaKernel< + // RAJA::statement::Tile<1, RAJA::tile_fixed<16>, RAJA::cuda_block_y_direct, + // RAJA::statement::Tile<0, RAJA::tile_fixed<16>, RAJA::cuda_block_x_direct, + // RAJA::statement::For<1, RAJA::cuda_thread_y_direct, + // RAJA::statement::For<0, RAJA::cuda_thread_x_direct, + // RAJA::statement::Lambda<0> + // > + // > + // > + // > + // > + // >; time = 0; @@ -182,8 +192,8 @@ void computeErr(double *P, double tf, grid_s grid) RAJA::ReduceMax tMax(-1.0); using initialPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::loop_exec >, - RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::Lambda<0>> >; + RAJA::statement::For<1, RAJA::loop_exec , + RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::Lambda<0> > > >; 
RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), [=] (RAJA::Index_type tx, RAJA::Index_type ty) { @@ -213,8 +223,8 @@ void setIC(double *P1, double *P2, double t0, double t1, grid_s grid) RAJA::RangeSegment fdBounds(0, grid.nx); using initialPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::loop_exec >, - RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::Lambda<0>> >; + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::Lambda<0>> > >; RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), [=] (RAJA::Index_type tx, RAJA::Index_type ty) { diff --git a/exercises/tutorial_halfday/ex3_colored-indexset.cpp b/exercises/tutorial_halfday/ex3_colored-indexset.cpp index 0f3fdcbf85..0d370fcfe9 100644 --- a/exercises/tutorial_halfday/ex3_colored-indexset.cpp +++ b/exercises/tutorial_halfday/ex3_colored-indexset.cpp @@ -13,6 +13,8 @@ #include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" + #include "memoryManager.hpp" /* @@ -213,23 +215,33 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif - -// -// Create a RAJA TypedIndexSet with four ListSegments, one for the indices of -// the elements in each subset. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. -// + // The TypedIndexSet is a variadic template, where the template arguments -// are the segment types that the TypedIndexSet can hold. -// + +// are the segment types that the TypedIndexSet can hold. +// + using SegmentType = RAJA::TypedListSegment; +#if defined(RAJA_ENABLE_OPENMP) + +// +// Resource object used to construct list segment objects with indices +// living in host (CPU) memory. +// + camp::resources::Resource host_res{camp::resources::Host()}; + +// +// Create a RAJA TypedIndexSet with four ListSegments, one for the indices of +// the elements in each subset. This will be used in the RAJA OpenMP and CUDA +// variants of the vertex sum calculation. + + RAJA::TypedIndexSet colorset; /// /// TODO... /// - /// EXERCISE: Create a RAJA::TypedIndexSet object that holds four - /// RAJA::TypedListSegment objects, one for each of the - /// 'idx' arrays above. + /// EXERCISE: Add four SegmentType objects to the colorset, one for each of + /// the 'idx' arrays above. Remember to pass the 'host_res' + /// object to the SegmentType constructor. /// @@ -238,8 +250,6 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // over segments, OpenMP parallel iteration of each segment) //----------------------------------------------------------------------------// -#if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running RAJA OpenMP index set vertex sum...\n"; std::memset(areav, 0, Nvert*Nvert * sizeof(double)); @@ -272,6 +282,36 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) +// +// Resource object used to construct list segment objects with indices +// living in CUDA device (GPU) memory. +// + camp::resources::Resource cuda_res{camp::resources::Cuda()}; + + RAJA::TypedIndexSet cuda_colorset; + + /// + /// TODO... + /// + /// EXERCISE: Add four SegmentType objects to the cuda_colorset, one for + /// each of the 'idx' arrays above. Remember to pass the 'cuda_res' + /// object to the SegmentType constructor. + /// + + + /// + /// TODO... 
+ /// + /// EXERCISE: Implement the vertex sum kernel a RAJA::forall + /// method with execution policy type + /// + /// RAJA::ExecPolicy> + /// + /// so that the kernel iterates over the segments sequentially + /// and executes each segment in parallel as a CUDA kernel. + + std::cout << "\n Running RAJA CUDA index set vertex sum...\n"; std::memset(areav, 0, Nvert*Nvert * sizeof(double)); diff --git a/exercises/tutorial_halfday/ex3_colored-indexset_solution.cpp b/exercises/tutorial_halfday/ex3_colored-indexset_solution.cpp index ffb476d521..1e3d364b58 100644 --- a/exercises/tutorial_halfday/ex3_colored-indexset_solution.cpp +++ b/exercises/tutorial_halfday/ex3_colored-indexset_solution.cpp @@ -13,6 +13,8 @@ #include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" + #include "memoryManager.hpp" /* @@ -210,31 +212,37 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif + +// The TypedIndexSet is a variadic template, where the template arguments +// are the segment types that the TypedIndexSet can hold. +// + using SegmentType = RAJA::TypedListSegment; +#if defined(RAJA_ENABLE_OPENMP) + +// +// Resource object used to construct list segment objects with indices +// living in host (CPU) memory. +// + camp::resources::Resource host_res{camp::resources::Host()}; + // // Create a RAJA TypedIndexSet with four ListSegments, one for the indices of // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA // variants of the vertex sum calculation. -// -// The TypedIndexSet is a variadic template, where the template arguments -// are the segment types that the TypedIndexSet can hold. -// - using SegmentType = RAJA::TypedListSegment; RAJA::TypedIndexSet colorset; - colorset.push_back( SegmentType(&idx[0][0], idx[0].size()) ); - colorset.push_back( SegmentType(&idx[1][0], idx[1].size()) ); - colorset.push_back( SegmentType(&idx[2][0], idx[2].size()) ); - colorset.push_back( SegmentType(&idx[3][0], idx[3].size()) ); + colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), host_res) ); + colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), host_res) ); + colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), host_res) ); + colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), host_res) ); //----------------------------------------------------------------------------// // RAJA OpenMP vertex sum calculation using TypedIndexSet (sequential iteration // over segments, OpenMP parallel iteration of each segment) //----------------------------------------------------------------------------// -#if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running RAJA OpenMP index set vertex sum...\n"; std::memset(areav, 0, Nvert*Nvert * sizeof(double)); @@ -264,6 +272,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) +// +// Resource object used to construct list segment objects with indices +// living in host (CPU) memory. +// + camp::resources::Resource cuda_res{camp::resources::Cuda()}; + +// +// Create a RAJA TypedIndexSet with four ListSegments, one for the indices of +// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA +// variants of the vertex sum calculation. 
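The CUDA variant the exercise above asks for follows the same pattern introduced earlier in this changeset: list segments whose indices are copied into device memory through a Cuda resource, iterated one segment (color) at a time, with each segment launched as a CUDA kernel. A compact sketch under stated assumptions (array names, the 256-thread block size, and CUDA-accessible data pointers are illustrative; it needs a CUDA-enabled RAJA build):

    #include "RAJA/RAJA.hpp"
    #include "camp/resource.hpp"
    #include <vector>

    using SegmentType = RAJA::TypedListSegment<int>;

    // areav, areae and e2v_map are assumed to point to CUDA-accessible
    // (e.g. unified) memory, as in the exercise code.
    void vertex_sum_cuda(const std::vector<int>& idx0,
                         const std::vector<int>& idx1,
                         const std::vector<int>& idx2,
                         const std::vector<int>& idx3,
                         double* areav, const double* areae,
                         const int* e2v_map)
    {
      // Segment index data is copied to the GPU through this resource.
      camp::resources::Resource cuda_res{camp::resources::Cuda()};

      RAJA::TypedIndexSet<SegmentType> cuda_colorset;
      cuda_colorset.push_back( SegmentType(&idx0[0], idx0.size(), cuda_res) );
      cuda_colorset.push_back( SegmentType(&idx1[0], idx1.size(), cuda_res) );
      cuda_colorset.push_back( SegmentType(&idx2[0], idx2.size(), cuda_res) );
      cuda_colorset.push_back( SegmentType(&idx3[0], idx3.size(), cuda_res) );

      // Segments (colors) run one after another; each segment is launched
      // as a CUDA kernel with 256-thread blocks.
      using EXEC_POL = RAJA::ExecPolicy<RAJA::seq_segit, RAJA::cuda_exec<256>>;

      RAJA::forall<EXEC_POL>(cuda_colorset, [=] RAJA_DEVICE (int ie) {
        const int* iv = &(e2v_map[4*ie]);
        areav[ iv[0] ] += areae[ie] / 4.0;
        areav[ iv[1] ] += areae[ie] / 4.0;
        areav[ iv[2] ] += areae[ie] / 4.0;
        areav[ iv[3] ] += areae[ie] / 4.0;
      });
    }

Swapping RAJA::cuda_exec for RAJA::omp_parallel_for_exec, and a Host resource for the segments, gives the OpenMP variant shown earlier in these files.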
+ + RAJA::TypedIndexSet cuda_colorset; + + cuda_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), cuda_res) ); + cuda_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), cuda_res) ); + cuda_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), cuda_res) ); + cuda_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), cuda_res) ); + std::cout << "\n Running RAJA CUDA index set vertex sum...\n"; std::memset(areav, 0, Nvert*Nvert * sizeof(double)); @@ -271,7 +297,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXEC_POL4 = RAJA::ExecPolicy>; - RAJA::forall(colorset, [=] RAJA_DEVICE (int ie) { + RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { int* iv = &(e2v_map[4*ie]); areav[ iv[0] ] += areae[ie] / 4.0 ; areav[ iv[1] ] += areae[ie] / 4.0 ; diff --git a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp index 51edcbaa5f..f3026c9f61 100644 --- a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp +++ b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp @@ -166,9 +166,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if 0 using KERNEL_EXEC_POL_SEQ = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, // Fill in inner loop execution statements.... RAJA::statement::Lambda<0> @@ -205,9 +205,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if 0 using KERNEL_EXEC_POL_OMP = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, // Fill in inner loop execution statements.... RAJA::statement::Lambda<0> @@ -246,9 +246,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Collapse, @@ -280,9 +280,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using KERNEL_EXEC_POL_CUDA = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, // Fill in inner loop execution statements.... 
RAJA::statement::Lambda<0> diff --git a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp index f26ea27775..c6495ee98a 100644 --- a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp +++ b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp @@ -164,9 +164,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using KERNEL_EXEC_POL_SEQ = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, @@ -198,9 +198,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using KERNEL_EXEC_POL_OMP = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::For<1, RAJA::omp_parallel_for_exec, RAJA::statement::For<0, RAJA::seq_exec, @@ -236,9 +236,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Collapse, @@ -269,9 +269,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using KERNEL_EXEC_POL_CUDA = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, RAJA::statement::For<0, RAJA::cuda_thread_x_direct, diff --git a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp index 884c207bcd..1b9f5ccd1a 100644 --- a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp +++ b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp @@ -117,8 +117,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int brow = 0; brow < outer_Dimr; ++brow) { + for (int bcol = 0; bcol < outer_Dimc; ++bcol) { // Stack-allocated local array for data on a tile int Tile[TILE_SZ][TILE_SZ]; @@ -132,8 +132,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int trow = 0; trow < TILE_SZ; ++trow) { for (int tcol = 0; tcol < TILE_SZ; ++tcol) { - int col = bx * TILE_SZ + tcol; // Matrix column index - int row = by * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check if (row < N_r && col < N_c) { @@ -151,8 +151,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int tcol = 0; tcol < TILE_SZ; ++tcol) { for (int trow = 0; trow < TILE_SZ; ++trow) { - int col = bx * 
TILE_SZ + tcol; // Matrix column index - int row = by * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check if (row < N_r && col < N_c) { @@ -209,17 +209,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::InitLocalMem, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, RAJA::statement::Lambda<0> > >, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, RAJA::statement::Lambda<1> > @@ -247,7 +247,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - [=](int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { RAJA_Tile(trow, tcol) = Aview(row, col); @@ -276,17 +276,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::InitLocalMem, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, RAJA::statement::Lambda<0> > >, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, RAJA::statement::Lambda<1> > @@ -315,7 +315,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Fill in lambda expression to write input matrix entry // to local tile array. 
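The Param swaps above make each ForICount statement store its tile-local index in the parameter-tuple slot that matches the lambda argument order (col, row, tcol, trow, tile). A sequential sketch of the corrected pairing, with the InitLocalMem and LocalArray details filled in as assumptions (TILE_DIM, the views, and int data are illustrative):

    #include "RAJA/RAJA.hpp"

    constexpr int TILE_DIM = 16;

    // Tile-local scratch array with compile-time dimensions.
    using TILE_MEM = RAJA::LocalArray<int, RAJA::Perm<0, 1>,
                                      RAJA::SizeList<TILE_DIM, TILE_DIM>>;

    // ForICount<1,...> writes its tile-local index into Param<1> and
    // ForICount<0,...> into Param<0>, so lambda arguments line up with the
    // parameter tuple.
    using SEQ_EXEC_POL = RAJA::KernelPolicy<
      RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::loop_exec,
        RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::loop_exec,
          RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem, RAJA::ParamList<2>,
            // Read phase (Lambda<0>): copy matrix entries into the tile.
            RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec,
              RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec,
                RAJA::statement::Lambda<0>
              >
            >,
            // Write phase (Lambda<1>): drain the tile in transposed order.
            RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec,
              RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec,
                RAJA::statement::Lambda<1>
              >
            >
          >
        >
      >
    >;

    void transpose_local_array(int* A, int* At, int N_r, int N_c)
    {
      RAJA::View<int, RAJA::Layout<2>> Aview(A, N_r, N_c);
      RAJA::View<int, RAJA::Layout<2>> Atview(At, N_c, N_r);

      TILE_MEM RAJA_Tile;

      RAJA::kernel_param<SEQ_EXEC_POL>(
        RAJA::make_tuple(RAJA::RangeSegment(0, N_c),
                         RAJA::RangeSegment(0, N_r)),
        RAJA::make_tuple((int)0, (int)0, RAJA_Tile),

        [=](int col, int row, int tcol, int trow, TILE_MEM& Tile) {
          Tile(trow, tcol) = Aview(row, col);
        },

        [=](int col, int row, int tcol, int trow, TILE_MEM Tile) {
          Atview(col, row) = Tile(trow, tcol);
        });
    }

The write-phase lambda may take the tile by value, as these files now do, since the LocalArray object is a lightweight wrapper around the tile memory.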
- [=](int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { Atview(col, row) = RAJA_Tile(trow, tcol); @@ -345,9 +345,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::InitLocalMem, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0> > @@ -355,9 +355,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::CudaSyncThreads, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<1> > @@ -386,7 +386,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - [=] RAJA_DEVICE(int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=] RAJA_DEVICE(int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { RAJA_Tile(trow, tcol) = Aview(row, col); diff --git a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp index fac8506606..2f2c1733d7 100644 --- a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp +++ b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp @@ -117,8 +117,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int brow = 0; brow < outer_Dimr; ++brow) { + for (int bcol = 0; bcol < outer_Dimc; ++bcol) { // Stack-allocated local array for data on a tile int Tile[TILE_SZ][TILE_SZ]; @@ -132,8 +132,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int trow = 0; trow < TILE_SZ; ++trow) { for (int tcol = 0; tcol < TILE_SZ; ++tcol) { - int col = bx * TILE_SZ + tcol; // Matrix column index - int row = by * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check if (row < N_r && col < N_c) { @@ -151,8 +151,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int tcol = 0; tcol < TILE_SZ; ++tcol) { for (int trow = 0; trow < TILE_SZ; ++trow) { - int col = bx * TILE_SZ + tcol; // Matrix column index - int row = by * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check if (row < N_r && col < N_c) { @@ -203,24 +203,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using SEQ_EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, RAJA::statement::InitLocalMem, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, 
RAJA::loop_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, RAJA::statement::Lambda<0> > >, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, RAJA::statement::Lambda<1> > @@ -235,13 +235,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - [=](int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { RAJA_Tile(trow, tcol) = Aview(row, col); }, - [=](int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { Atview(col, row) = RAJA_Tile(trow, tcol); @@ -259,24 +259,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using OPENMP_EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, RAJA::statement::InitLocalMem, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, RAJA::statement::Lambda<0> > >, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, RAJA::statement::Lambda<1> > @@ -290,13 +290,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - [=](int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { RAJA_Tile(trow, tcol) = Aview(row, col); }, - [=](int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { Atview(col, row) = RAJA_Tile(trow, tcol); @@ -317,16 +317,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using CUDA_EXEC_POL = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::statement::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::statement::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, RAJA::statement::InitLocalMem, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0> > @@ -334,9 +334,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::CudaSyncThreads, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<1, 
RAJA::statement::Param<0>, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<1> > @@ -354,13 +354,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - [=] RAJA_DEVICE (int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=] RAJA_DEVICE (int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { RAJA_Tile(trow, tcol) = Aview(row, col); }, - [=] RAJA_DEVICE(int col, int row, int tcol, int trow, TILE_MEM &RAJA_Tile) { + [=] RAJA_DEVICE(int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { Atview(col, row) = RAJA_Tile(trow, tcol); diff --git a/host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@8.0.1-cuda.cmake b/host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@8.0.1-cuda.cmake new file mode 100644 index 0000000000..dcb5a731d4 --- /dev/null +++ b/host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@8.0.1-cuda.cmake @@ -0,0 +1,58 @@ +################### +# Generated host-config - Edit at own risk! +################### +# Copyright (c) 2020, Lawrence Livermore National Security, LLC and +# other Umpire Project Developers. See the top-level LICENSE file for +# details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +################### + +#------------------ +# SYS_TYPE: blueos_3_ppc64le_ib_p9 +# Compiler Spec: clang@8.0.1 +# CMake executable path: /usr/tce/packages/cmake/cmake-3.14.5/bin/cmake +#------------------ + +#------------------ +# Compilers +#------------------ + +set(CMAKE_C_COMPILER "/usr/tce/packages/clang/clang-8.0.1/bin/clang" CACHE PATH "") + +set(CMAKE_CXX_COMPILER "/usr/tce/packages/clang/clang-8.0.1/bin/clang++" CACHE PATH "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +#------------------------------------------------------------------------------ +# Cuda +#------------------------------------------------------------------------------ + +set(ENABLE_CUDA ON CACHE BOOL "") + +set(CUDA_TOOLKIT_ROOT_DIR "/usr/tce/packages/cuda/cuda-10.1.243" CACHE PATH "") + +set(CMAKE_CUDA_COMPILER "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" CACHE PATH "") + +set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -Xcompiler -O3 -Xcompiler -fopenmp" CACHE STRING "") + +set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O3 -g -lineinfo -Xcompiler -O3 -Xcompiler -fopenmp" CACHE STRING "") + +set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g -G" CACHE STRING "") + +#------------------------------------------------------------------------------ +# Other +#------------------------------------------------------------------------------ + +set(RAJA_RANGE_ALIGN "4" CACHE STRING "") + +set(RAJA_RANGE_MIN_LENGTH "32" CACHE STRING "") + +set(RAJA_DATA_ALIGN "64" CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED ON CACHE BOOL "") + diff --git a/host-configs/lc-builds/bgqos/clang_4_0_0.cmake b/host-configs/lc-builds/bgqos/clang_4_0_0.cmake index 81a6110098..01a5e3bdf0 100644 --- a/host-configs/lc-builds/bgqos/clang_4_0_0.cmake +++ b/host-configs/lc-builds/bgqos/clang_4_0_0.cmake @@ -19,8 +19,6 @@ set(MPIEXEC_NUMPROC_FLAG "-n" CACHE PATH "") set(ENABLE_WRAP_ALL_TESTS_WITH_MPIEXEC TRUE CACHE BOOL "Ensures that tests will be wrapped with srun to run on the backend nodes") -set(RAJA_RANGE_ALIGN 4 CACHE INT "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "") set(RAJA_DATA_ALIGN 64 CACHE INT "") set(RAJA_HOST_CONFIG_LOADED On CACHE Bool "") diff --git a/host-configs/lc-builds/blueos/clang_X.cmake 
b/host-configs/lc-builds/blueos/clang_X.cmake index f04bd69264..50e7dbac28 100755 --- a/host-configs/lc-builds/blueos/clang_X.cmake +++ b/host-configs/lc-builds/blueos/clang_X.cmake @@ -11,8 +11,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/clangcuda_upstream_2018_12_03_nvcc_9_2.cmake b/host-configs/lc-builds/blueos/clangcuda_upstream_2018_12_03_nvcc_9_2.cmake index 5ce2ccbb4f..d3f36540f2 100644 --- a/host-configs/lc-builds/blueos/clangcuda_upstream_2018_12_03_nvcc_9_2.cmake +++ b/host-configs/lc-builds/blueos/clangcuda_upstream_2018_12_03_nvcc_9_2.cmake @@ -16,8 +16,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/gcc_X.cmake b/host-configs/lc-builds/blueos/gcc_X.cmake index da4d104c04..81c7bcc411 100755 --- a/host-configs/lc-builds/blueos/gcc_X.cmake +++ b/host-configs/lc-builds/blueos/gcc_X.cmake @@ -11,8 +11,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -finline-functions -finline-limit=20000" CAC set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -finline-functions -finline-limit=20000" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/nvcc_clang_X.cmake b/host-configs/lc-builds/blueos/nvcc_clang_X.cmake index e6e5d24182..9c356e1e83 100755 --- a/host-configs/lc-builds/blueos/nvcc_clang_X.cmake +++ b/host-configs/lc-builds/blueos/nvcc_clang_X.cmake @@ -17,8 +17,6 @@ set(CMAKE_CUDA_FLAGS_RELEASE "-O3 ${HOST_OPT_FLAGS}" CACHE STRING "") set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo -O3 ${HOST_OPT_FLAGS}" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake b/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake index bc99e7ce4c..c2e5948640 100755 --- a/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake +++ b/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake @@ -17,8 +17,6 @@ set(CMAKE_CUDA_FLAGS_RELEASE "-O3 ${HOST_OPT_FLAGS}" CACHE STRING "") set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo -O3 ${HOST_OPT_FLAGS}" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/nvcc_xl_2019_X.cmake b/host-configs/lc-builds/blueos/nvcc_xl_X.cmake similarity index 93% rename from host-configs/lc-builds/blueos/nvcc_xl_2019_X.cmake rename to host-configs/lc-builds/blueos/nvcc_xl_X.cmake index 2eebc6091e..8b6662a862 100755 --- 
a/host-configs/lc-builds/blueos/nvcc_xl_2019_X.cmake +++ b/host-configs/lc-builds/blueos/nvcc_xl_X.cmake @@ -23,8 +23,6 @@ set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo -O3 ${HOST_OPT_FLAGS}" CACHE S # - 1500-036 nostrict optimizations may alter code semantics # (can be countered with -qstrict, with less optimization) -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/pgi_X.cmake b/host-configs/lc-builds/blueos/pgi_X.cmake index 253135fa71..f746940489 100755 --- a/host-configs/lc-builds/blueos/pgi_X.cmake +++ b/host-configs/lc-builds/blueos/pgi_X.cmake @@ -11,8 +11,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fast -mp" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-fast -g -mp" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -mp" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/xl_2019_X.cmake b/host-configs/lc-builds/blueos/xl_2020_X.cmake similarity index 92% rename from host-configs/lc-builds/blueos/xl_2019_X.cmake rename to host-configs/lc-builds/blueos/xl_2020_X.cmake index 4973bbb431..c04f835145 100755 --- a/host-configs/lc-builds/blueos/xl_2019_X.cmake +++ b/host-configs/lc-builds/blueos/xl_2020_X.cmake @@ -17,8 +17,6 @@ set(CMAKE_EXE_LINKER_FLAGS "-Wl,-z,muldefs" CACHE STRING "") # - 1500-036 nostrict optimizations may alter code semantics # (can be countered with -qstrict, with less optimization) -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/blueos/xl_X.cmake b/host-configs/lc-builds/blueos/xl_X.cmake new file mode 100755 index 0000000000..c04f835145 --- /dev/null +++ b/host-configs/lc-builds/blueos/xl_X.cmake @@ -0,0 +1,23 @@ +############################################################################### +# Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_XLC" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -qsmp=omp:noopt " CACHE STRING "") +set(CMAKE_EXE_LINKER_FLAGS "-Wl,-z,muldefs" CACHE STRING "") + +# Suppressed XLC warnings: +# - 1500-029 cannot inline +# - 1500-036 nostrict optimizations may alter code semantics +# (can be countered with -qstrict, with less optimization) + +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") + diff --git a/host-configs/lc-builds/toss3/clang_X.cmake b/host-configs/lc-builds/toss3/clang_X.cmake index beedc17ea6..506bce066d 100755 --- a/host-configs/lc-builds/toss3/clang_X.cmake +++ b/host-configs/lc-builds/toss3/clang_X.cmake @@ -11,8 +11,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-O3 -msse4.2 -funroll-loops -finline-functions" CAC set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -msse4.2 -funroll-loops -finline-functions" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/toss3/clangcuda_6_0_0_nvcc_8_0.cmake b/host-configs/lc-builds/toss3/clangcuda_6_0_0_nvcc_8_0.cmake index b769677d16..56f0ba9320 100644 --- a/host-configs/lc-builds/toss3/clangcuda_6_0_0_nvcc_8_0.cmake +++ b/host-configs/lc-builds/toss3/clangcuda_6_0_0_nvcc_8_0.cmake @@ -16,8 +16,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-O3 -msse4.2 -funroll-loops -finline-functions" CAC set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -msse4.2 -funroll-loops -finline-functions" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE INT "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/toss3/gcc_X.cmake b/host-configs/lc-builds/toss3/gcc_X.cmake index da4d104c04..81c7bcc411 100755 --- a/host-configs/lc-builds/toss3/gcc_X.cmake +++ b/host-configs/lc-builds/toss3/gcc_X.cmake @@ -11,8 +11,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -finline-functions -finline-limit=20000" CAC set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -finline-functions -finline-limit=20000" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/toss3/hip.cmake b/host-configs/lc-builds/toss3/hip.cmake new file mode 100644 index 0000000000..e0de15ac9b --- /dev/null +++ b/host-configs/lc-builds/toss3/hip.cmake @@ -0,0 +1,28 @@ +############################################################################### +# Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O2" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(HIP_COMMON_OPT_FLAGS ) +set(HIP_COMMON_DEBUG_FLAGS) +set(HOST_OPT_FLAGS) + +if(CMAKE_BUILD_TYPE MATCHES Release) + set(RAJA_HIPCC_FLAGS "-fPIC -O2 ${HIP_COMMON_OPT_FLAGS} ${HOST_OPT_FLAGS}" CACHE STRING "") +elseif(CMAKE_BUILD_TYPE MATCHES RelWithDebInfo) + set(RAJA_HIPCC_FLAGS "-fPIC -g -O2 ${HIP_COMMON_OPT_FLAGS} ${HOST_OPT_FLAGS}" CACHE STRING "") +elseif(CMAKE_BUILD_TYPE MATCHES Debug) + set(RAJA_HIPCC_FLAGS "-fPIC -g -O0 ${HIP_COMMON_DEBUG_FLAGS}" CACHE STRING "") +endif() + +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/toss3/icpc_X_gcc7headers.cmake b/host-configs/lc-builds/toss3/icpc_X_gcc7headers.cmake index 877d4f3189..3e7e3a7675 100755 --- a/host-configs/lc-builds/toss3/icpc_X_gcc7headers.cmake +++ b/host-configs/lc-builds/toss3/icpc_X_gcc7headers.cmake @@ -13,8 +13,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3 -fp-model source -unroll-aggres set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g -fp-model source -unroll-aggressive -finline-functions -axCORE-AVX2 -diag-disable cpu-dispatch" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake b/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake index 46b1ac878f..ae34c4e6da 100755 --- a/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake +++ b/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake @@ -13,8 +13,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3 -march=native -ansi-alias -axCO set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g -march=native -ansi-alias -axCORE-AVX2 -diag-disable cpu-dispatch" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -g" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/toss3/pgi_X.cmake b/host-configs/lc-builds/toss3/pgi_X.cmake index 7a8f29d98f..a54a39c0e8 100755 --- a/host-configs/lc-builds/toss3/pgi_X.cmake +++ b/host-configs/lc-builds/toss3/pgi_X.cmake @@ -11,8 +11,6 @@ set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fast -mp" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -fast -mp" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -mp" CACHE STRING "") -set(RAJA_RANGE_ALIGN 4 CACHE STRING "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") set(RAJA_DATA_ALIGN 64 CACHE STRING "") set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/macos-builds/clang_X.cmake b/host-configs/macos-builds/clang_X.cmake new file mode 100755 index 0000000000..f10479f54d --- /dev/null +++ b/host-configs/macos-builds/clang_X.cmake @@ -0,0 +1,18 @@ +############################################################################### +# Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -funroll-loops -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -march=native -funroll-loops -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(RAJA_RANGE_ALIGN 4 CACHE STRING "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/quartz-toss_3_x86_64_ib-clang@9.0.0.cmake b/host-configs/quartz-toss_3_x86_64_ib-clang@9.0.0.cmake new file mode 100644 index 0000000000..3af54ba3b8 --- /dev/null +++ b/host-configs/quartz-toss_3_x86_64_ib-clang@9.0.0.cmake @@ -0,0 +1,44 @@ +################### +# Generated host-config - Edit at own risk! +################### +# Copyright (c) 2020, Lawrence Livermore National Security, LLC and +# other Umpire Project Developers. See the top-level LICENSE file for +# details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +################### + +#------------------ +# SYS_TYPE: toss_3_x86_64_ib +# Compiler Spec: clang@9.0.0 +# CMake executable path: /usr/tce/packages/cmake/cmake-3.14.5/bin/cmake +#------------------ + +#------------------ +# Compilers +#------------------ + +set(CMAKE_C_COMPILER "/usr/tce/packages/clang/clang-9.0.0/bin/clang" CACHE PATH "") + +set(CMAKE_CXX_COMPILER "/usr/tce/packages/clang/clang-9.0.0/bin/clang++" CACHE PATH "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -msse4.2 -funroll-loops -finline-functions" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -msse4.2 -funroll-loops -finline-functions" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(ENABLE_CUDA OFF CACHE BOOL "") + +#------------------------------------------------------------------------------ +# Other +#------------------------------------------------------------------------------ + +set(RAJA_RANGE_ALIGN "4" CACHE STRING "") + +set(RAJA_RANGE_MIN_LENGTH "32" CACHE STRING "") + +set(RAJA_DATA_ALIGN "64" CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED ON CACHE BOOL "") + diff --git a/host-configs/ubuntu-builds/clang_X.cmake b/host-configs/ubuntu-builds/clang_X.cmake new file mode 100644 index 0000000000..beedc17ea6 --- /dev/null +++ b/host-configs/ubuntu-builds/clang_X.cmake @@ -0,0 +1,18 @@ +############################################################################### +# Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -msse4.2 -funroll-loops -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -msse4.2 -funroll-loops -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(RAJA_RANGE_ALIGN 4 CACHE STRING "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/ubuntu-builds/gcc_X.cmake b/host-configs/ubuntu-builds/gcc_X.cmake new file mode 100644 index 0000000000..da4d104c04 --- /dev/null +++ b/host-configs/ubuntu-builds/gcc_X.cmake @@ -0,0 +1,18 @@ +############################################################################### +# Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_GNU" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -finline-functions -finline-limit=20000" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -finline-functions -finline-limit=20000" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(RAJA_RANGE_ALIGN 4 CACHE STRING "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/hip.cmake b/host-configs/ubuntu-builds/hip.cmake similarity index 61% rename from host-configs/hip.cmake rename to host-configs/ubuntu-builds/hip.cmake index 8c020ac69b..7109eef7e3 100644 --- a/host-configs/hip.cmake +++ b/host-configs/ubuntu-builds/hip.cmake @@ -11,10 +11,10 @@ set(ENABLE_HIP ON CACHE BOOL "") set(ENABLE_OPENMP OFF CACHE BOOL "") set(ENABLE_CUDA Off CACHE BOOL "") -set(HIP_ROOT_DIR "/opt/rocm/hip" CACHE PATH "HIP ROOT directory path") +set(HIP_ROOT_DIR "${ROCM_DIR}/hip" CACHE PATH "HIP ROOT directory path") -set(CMAKE_CXX_COMPILER "g++" CACHE PATH "") -set(CMAKE_C_COMPILER "gcc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/usr/bin/g++" CACHE PATH "") +set(CMAKE_C_COMPILER "/usr/bin/gcc" CACHE PATH "") set(CMAKE_CXX_FLAGS_RELEASE "-O2" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g" CACHE STRING "") @@ -30,16 +30,15 @@ if (ENABLE_OPENMP) endif() if(CMAKE_BUILD_TYPE MATCHES Release) - set(RAJA_HIPCC_FLAGS -O2; ${HIP_COMMON_OPT_FLAGS}; ${HOST_OPT_FLAGS} CACHE LIST "") + set(RAJA_HIPCC_FLAGS "-O2 ${HIP_COMMON_OPT_FLAGS} ${HOST_OPT_FLAGS}" CACHE STRING "") elseif(CMAKE_BUILD_TYPE MATCHES RelWithDebInfo) - set(RAJA_HIPCC_FLAGS -g; -G; -O2; ${HIP_COMMON_OPT_FLAGS}; ${HOST_OPT_FLAGS} CACHE LIST "") + set(RAJA_HIPCC_FLAGS "-g -O2 ${HIP_COMMON_OPT_FLAGS} ${HOST_OPT_FLAGS}" CACHE STRING "") elseif(CMAKE_BUILD_TYPE MATCHES Debug) - set(RAJA_HIPCC_FLAGS -g; -G; -O0; ${HIP_COMMON_DEBUG_FLAGS}; CACHE LIST "") + set(RAJA_HIPCC_FLAGS "-g -O0 ${HIP_COMMON_DEBUG_FLAGS}" CACHE STRING "") endif() -set(RAJA_RANGE_ALIGN 4 CACHE INT "") -set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "") -set(RAJA_DATA_ALIGN 64 CACHE INT "") -set(RAJA_COHERENCE_BLOCK_SIZE 64 CACHE INT "") +set(RAJA_RANGE_ALIGN 4 CACHE STRING "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") +set(RAJA_DATA_ALIGN 64 CACHE STRING "") 
-set(RAJA_HOST_CONFIG_LOADED On CACHE Bool "") +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/ubuntu-builds/nvcc_gcc_X.cmake b/host-configs/ubuntu-builds/nvcc_gcc_X.cmake new file mode 100644 index 0000000000..bc99e7ce4c --- /dev/null +++ b/host-configs/ubuntu-builds/nvcc_gcc_X.cmake @@ -0,0 +1,24 @@ +############################################################################### +# Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_GNU" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(HOST_OPT_FLAGS "-Xcompiler -O3 -Xcompiler -finline-functions -Xcompiler -fopenmp") + +set(CMAKE_CUDA_FLAGS_RELEASE "-O3 ${HOST_OPT_FLAGS}" CACHE STRING "") +set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0" CACHE STRING "") +set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo -O3 ${HOST_OPT_FLAGS}" CACHE STRING "") + +set(RAJA_RANGE_ALIGN 4 CACHE STRING "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index e0ef93d556..ea04033775 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -43,6 +43,7 @@ // #include "RAJA/pattern/forall.hpp" #include "RAJA/pattern/kernel.hpp" +#include "RAJA/pattern/teams.hpp" // @@ -108,6 +109,16 @@ #include "RAJA/util/View.hpp" +// +// View for sequences of objects +// +#include "RAJA/util/Span.hpp" + +// +// zip iterator to iterator over sequences simultaneously +// +#include "RAJA/util/zip.hpp" + // // Atomic operations support // @@ -123,6 +134,17 @@ // #include "RAJA/util/BitMask.hpp" +// +// sort algorithms +// +#include "RAJA/util/sort.hpp" + +// +// WorkPool, WorkGroup, WorkSite objects +// +#include "RAJA/policy/WorkGroup.hpp" +#include "RAJA/pattern/WorkGroup.hpp" + // // Reduction objects // @@ -148,7 +170,12 @@ // #include "RAJA/index/IndexSetUtils.hpp" +#include "RAJA/index/IndexSetBuilders.hpp" #include "RAJA/pattern/scan.hpp" +#include "RAJA/util/PluginLinker.hpp" + +#include "RAJA/pattern/sort.hpp" + #endif // closing endif for header file include guard diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in index a1f7847fed..91dc8c56a8 100644 --- a/include/RAJA/config.hpp.in +++ b/include/RAJA/config.hpp.in @@ -39,7 +39,7 @@ ****************************************************************************** */ #cmakedefine ENABLE_FT - +#cmakedefine ENABLE_ITERATOR_OVERFLOW_DEBUG /*! ****************************************************************************** * @@ -77,10 +77,19 @@ */ #cmakedefine RAJA_ENABLE_BOUNDS_CHECK +/* + ****************************************************************************** + * + * \brief Exhaustive index types for tests + * + ****************************************************************************** + */ +#cmakedefine RAJA_TEST_EXHAUSTIVE + /*! ****************************************************************************** * - * \brief Programming model back-ends, plus CHAI enable/disable. + * \brief Programming model back-ends. 
* ****************************************************************************** */ @@ -91,6 +100,8 @@ #cmakedefine RAJA_ENABLE_CLANG_CUDA #cmakedefine RAJA_ENABLE_HIP +#cmakedefine RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL + /*! ****************************************************************************** * @@ -135,11 +146,21 @@ namespace RAJA { #if defined(RAJA_ENABLE_OPENMP) -#if not defined(_OPENMP) -#error RAJA configured with ENABLE_OPENMP, but OpenMP not supported by current compiler +#if defined(_OPENMP) +#if _OPENMP >= 200805 +#define RAJA_ENABLE_OPENMP_TASK +#endif +#else +#error RAJA configured with ENABLE_OPENMP, but OpenMP not supported by current compiler #endif // _OPENMP #endif // RAJA_ENABLE_OPENMP +#if defined(RAJA_ENABLE_CUDA) +#if not defined(__CUDACC__) +#error RAJA configured with ENABLE_CUDA, but CUDA not supported by current compiler +#endif // +#endif // RAJA_ENABLE_CUDA + /*! ****************************************************************************** @@ -162,23 +183,11 @@ namespace RAJA { */ // -// Platform-specific constants for range index set and data alignment: -// -// RANGE_ALIGN - alignment of begin/end indices in range segments -// (i.e., starting index and length of range segments -// constructed by index set builder methods will -// be multiples of this value) -// -// RANGE_MIN_LENGTH - used in index set builder methods -// as min length of range segments (an integer multiple -// of RANGE_ALIGN) +// Platform-specific constants for data alignment: // // DATA_ALIGN - used in compiler-specific intrinsics and type aliases // to specify alignment of data, loop bounds, etc.; // units of "bytes" - -const int RANGE_ALIGN = @RAJA_RANGE_ALIGN@; -const int RANGE_MIN_LENGTH = @RAJA_RANGE_MIN_LENGTH@; const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #if defined (_WIN32) @@ -187,32 +196,20 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #define RAJA_RESTRICT __restrict__ #endif +#if !defined(RAJA_COMPILER_MSVC) +#define RAJA_COLLAPSE(X) collapse(X) +#else +#define RAJA_COLLAPSE(X) +#endif + // // Runtime bounds checking for Views // #if defined(RAJA_ENABLE_BOUNDS_CHECK) #define RAJA_BOUNDS_CHECK_INTERNAL #define RAJA_BOUNDS_CHECK_constexpr - -#if !defined(NDEBUG) -#define RAJA_ASSERT(EXP) assert( (EXP) ) -#endif - -// -//TODO: Once HIP is supported use asm("s_trap 2"); -// to halt HIP kernels. 
-// -#if defined(NDEBUG) && defined(__CUDA_ARCH__) -#define RAJA_ASSERT(EXP) asm ("trap;") -#endif - -#if defined(NDEBUG) && !defined(__CUDA_ARCH__) -#define RAJA_ASSERT(EXP) abort(); -#endif - #else #define RAJA_BOUNDS_CHECK_constexpr constexpr -#define RAJA_ASSERT(EXP) #endif // @@ -299,11 +296,11 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #if defined(_OPENMP) && (_OPENMP >= 201307) #define RAJA_SIMD RAJA_PRAGMA(omp simd) -#define RAJA_NO_SIMD +#define RAJA_NO_SIMD #elif defined(__GNUC__) && defined(__GNUC_MINOR__) && \ ( ( (__GNUC__ == 4) && (__GNUC_MINOR__ == 9) ) || (__GNUC__ >= 5) ) #define RAJA_SIMD RAJA_PRAGMA(GCC ivdep) -#define RAJA_NO_SIMD +#define RAJA_NO_SIMD #else #define RAJA_SIMD #define RAJA_NO_SIMD @@ -357,13 +354,24 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; // Apple Clang compiler supports older options #if ( ( (__clang_major__ >= 4 ) || (__clang_major__ >= 3 && __clang_minor__ > 7) ) && !defined(__APPLE__) ) #define RAJA_SIMD RAJA_PRAGMA(clang loop vectorize(assume_safety)) -#else +#else #define RAJA_SIMD RAJA_PRAGMA(clang loop vectorize(enable)) #endif #define RAJA_NO_SIMD RAJA_PRAGMA(clang loop vectorize(disable)) #endif + +// This is the same as undefined compiler, but squelches the warning message +#elif defined(RAJA_COMPILER_MSVC) + +#define RAJA_FORCEINLINE_RECURSIVE +#define RAJA_INLINE inline +#define RAJA_ALIGN_DATA(d) d +#define RAJA_SIMD +#define RAJA_NO_SIMD + + #else #pragma message("RAJA_COMPILER unknown, using default empty macros.") @@ -400,7 +408,7 @@ T * align_hint(T * x) return static_cast(RAJA_ALIGN_DATA(x)); #endif } - + } // closing brace for RAJA namespace #endif // closing endif for header file include guard diff --git a/include/RAJA/index/IndexSet.hpp b/include/RAJA/index/IndexSet.hpp index 5897edefbf..d95e43a871 100644 --- a/include/RAJA/index/IndexSet.hpp +++ b/include/RAJA/index/IndexSet.hpp @@ -90,7 +90,12 @@ class TypedIndexSet : public TypedIndexSet "All segments must have the same value_type"); //! Construct empty index set +#if _MSC_VER < 1910 + // this one instance of constexpr does not work on VS2012 or VS2015 + RAJA_INLINE TypedIndexSet() : PARENT() {} +#else RAJA_INLINE constexpr TypedIndexSet() : PARENT() {} +#endif //! Copy-constructor for index set RAJA_INLINE @@ -232,12 +237,15 @@ class TypedIndexSet : public TypedIndexSet { Index_type num = getNumSegments(); - RangeStrideSegment Iter = (pend == PUSH_BACK) - ? RangeStrideSegment(0, num, 1) - : RangeStrideSegment(num - 1, -1, -1); - - for (Index_type i : Iter) - segment_push_into(i, c, pend, pcopy); + if (pend == PUSH_BACK) { + for (Index_type i = 0; i < num; ++i) { + segment_push_into(i, c, pend, pcopy); + } + } else { + for (Index_type i = num-1; i > -1; --i) { + segment_push_into(i, c, pend, pcopy); + } + } } diff --git a/include/RAJA/index/IndexSetBuilders.hpp b/include/RAJA/index/IndexSetBuilders.hpp index 1202a1a554..60e8c160e0 100644 --- a/include/RAJA/index/IndexSetBuilders.hpp +++ b/include/RAJA/index/IndexSetBuilders.hpp @@ -21,33 +21,46 @@ #include "RAJA/config.hpp" #include "RAJA/index/IndexSet.hpp" +#include "RAJA/index/ListSegment.hpp" +#include "RAJA/index/RangeSegment.hpp" #include "RAJA/util/types.hpp" +#include "camp/resource.hpp" + namespace RAJA { /*! ****************************************************************************** * - * \brief Initialize index set with aligned Ranges and List segments from - * array of indices with given length. 
- * - * Specifically, Range segments will be greater than RANGE_MIN_LENGTH - * and starting index and length of each range segment will be - * multiples of RANGE_ALIGN. These constants are defined in the - * RAJA config.hpp header file. + * \brief Generate an index set with aligned Range segments and List segments, + * as needed, from given array of indices. * - * Routine does no error-checking on argements and assumes Index_type - * array contains valid indices. + * Routine does no error-checking on argements and assumes + * RAJA::Index_type array contains valid indices. * - * Note: Method assumes TypedIndexSet reference refers to an empty index set. + * \param iset reference to index set generated with aligned range segments + * and list segments. Method assumes index set is empty (no segments). + * \param work_res camp resource object that identifies the memory space in + * which list segment index data will live (passed to list segment + * ctor). + * \param indices_in pointer to start of input array of indices. + * \param length size of input index array. + * \param range_min_length min length of any range segment in index set + * \param range_align "alignment" value for range segments in index set. + * Starting index each range segment will be a multiple of this value. * ****************************************************************************** */ -void buildTypedIndexSetAligned(IndexSet& hiset, - const Index_type* const indices_in, - Index_type length); +void buildIndexSetAligned( + RAJA::TypedIndexSet& iset, + camp::resources::Resource& work_res, + const RAJA::Index_type* const indices_in, + RAJA::Index_type length, + RAJA::Index_type range_min_length, + RAJA::Index_type range_align); + //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// @@ -62,47 +75,56 @@ void buildTypedIndexSetAligned(IndexSet& hiset, //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// -/* +/*! ****************************************************************************** * - * Initialize lock-free "block" index set (planar division). + * \brief Generate a lock-free "block" index set (planar division) containing + * range segments. * - * The method chunks a fastDim x midDim x slowDim mesh into blocks that can - * be dependency-scheduled, removing need for lock constructs. + * The method chunks a fastDim x midDim x slowDim mesh into blocks that + * can be dependency-scheduled, removing need for lock constructs. * - * Note: Method assumes TypedIndexSet reference refers to an empty index set. + * \param iset reference to index set generated with range segments. + * Method assumes index set is empty (no segments). + * \param fastDim "fast" block dimension (see above). + * \param midDim "mid" block dimension (see above). + * \param slowDim "slow" block dimension (see above). * ****************************************************************************** */ void buildLockFreeBlockIndexset( - RAJA::TypedIndexSet& iset, + RAJA::TypedIndexSet& iset, int fastDim, int midDim, int slowDim); -/* +/*! ****************************************************************************** * - * Build Lock-free "color" index set. The domain-set is colored based on - * connectivity to the range-set. All elements in each segment are - * independent, and no two segments can be executed in parallel. 
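The renamed builder above, buildIndexSetAligned, now takes a camp resource for the list segment data plus explicit range_min_length and range_align arguments (the values 32 and 4 below match the old RAJA_RANGE_MIN_LENGTH and RAJA_RANGE_ALIGN defaults removed elsewhere in this changeset). A hypothetical call; the index set's segment types are an assumption, since this hunk does not show them:

    #include "RAJA/RAJA.hpp"
    #include "camp/resource.hpp"

    void build_aligned_iset(const RAJA::Index_type* indices_in,
                            RAJA::Index_type length)
    {
      // Assumed segment types: range segments plus list segments over
      // Index_type.
      RAJA::TypedIndexSet<RAJA::RangeSegment,
                          RAJA::TypedListSegment<RAJA::Index_type>> iset;

      // List segment index data will live in host memory.
      camp::resources::Resource work_res{camp::resources::Host()};

      RAJA::buildIndexSetAligned(iset, work_res, indices_in, length,
                                 32 /* range_min_length */,
                                 4  /* range_align */);
    }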
+ * \brief Generate a lock-free "color" index set containing range and list + * segments. + * + * TThe domain-set is colored based on connectivity to the range-set. + * All elements in each segment are independent, and no two segments + * can be executed in parallel. * - * Note: Method assumes TypedIndexSet reference refers to an empty index set. + * \param iset reference to index set generated. Method assumes index set + * is empty (no segments). + * \param work_res camp resource object that identifies the memory space in + * which list segment index data will live (passed to list segment + * ctor). * ****************************************************************************** */ void buildLockFreeColorIndexset( - RAJA::TypedIndexSet& iset, - Index_type const* domainToRange, + RAJA::TypedIndexSet& iset, + camp::resources::Resource& work_res, + RAJA::Index_type const* domainToRange, int numEntity, int numRangePerDomain, int numEntityRange, - Index_type* elemPermutation = 0l, - Index_type* ielemPermutation = 0l); + RAJA::Index_type* elemPermutation = nullptr, + RAJA::Index_type* ielemPermutation = nullptr); } // namespace RAJA diff --git a/include/RAJA/index/IndexValue.hpp b/include/RAJA/index/IndexValue.hpp index 2f815969d4..e863978c86 100644 --- a/include/RAJA/index/IndexValue.hpp +++ b/include/RAJA/index/IndexValue.hpp @@ -356,6 +356,19 @@ struct StripIndexTypeT using strip_index_type_t = typename internal::StripIndexTypeT::type; +/*! + * \brief Converts a type into a signed type. Also handles floating point + * types as std::make_signed only supports integral types. + * + * \param FROM the original type + */ +template +using make_signed_t = typename std::conditional < + std::is_floating_point::value, + std::common_type, + std::make_signed + >::type::type; + } // namespace RAJA /*! @@ -381,18 +394,17 @@ using strip_index_type_t = typename internal::StripIndexTypeT::type; /*! * \brief Helper Macro to create new Index types. * \param TYPE the name of the type + * \param IDXT the index types value type * \param NAME a string literal to identify this index type */ #define RAJA_INDEX_VALUE_T(TYPE, IDXT, NAME) \ class TYPE : public ::RAJA::IndexValue \ { \ - using parent = ::RAJA::IndexValue; \ - \ public: \ - using IndexValueType = TYPE; \ - RAJA_HOST_DEVICE RAJA_INLINE TYPE() : parent::IndexValue() {} \ - RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(IDXT v) \ - : parent::IndexValue(v) \ + RAJA_HOST_DEVICE RAJA_INLINE TYPE() \ + : RAJA::IndexValue::IndexValue() {} \ + RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(IDXT v) \ + : RAJA::IndexValue::IndexValue(v) \ { \ } \ static inline std::string getName() { return NAME; } \ diff --git a/include/RAJA/index/ListSegment.hpp b/include/RAJA/index/ListSegment.hpp index 06efda5b33..0f5ad36e2e 100644 --- a/include/RAJA/index/ListSegment.hpp +++ b/include/RAJA/index/ListSegment.hpp @@ -24,10 +24,11 @@ #include #include -#include "RAJA/internal/Span.hpp" +#include "camp/resource.hpp" #include "RAJA/util/concepts.hpp" #include "RAJA/util/macros.hpp" +#include "RAJA/util/Span.hpp" #include "RAJA/util/types.hpp" #if (defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))) && defined(RAJA_ENABLE_CUDA) @@ -61,7 +62,14 @@ namespace RAJA template class TypedListSegment { - +/* + * All of the following down to the 'public' section is original machinery + * to manage segment index data using CUDA or HIP unified memory. 
Eventually, + * it will be removed, but is left in place for now to preserve original + * behavior so our tests don't need to be reworked en masse now and users + * won't see any different usage or behavior. + */ + #if ((defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))) && defined(RAJA_ENABLE_CUDA)) || defined(RAJA_ENABLE_HIP) static constexpr bool Has_GPU = true; #else @@ -117,6 +125,7 @@ class TypedListSegment cudaErrchk(cudaMemcpy( m_data, &(*src.begin()), m_size * sizeof(T), cudaMemcpyDefault)); } + #elif defined(RAJA_ENABLE_HIP) //! copy data from container using BlockCopy template @@ -166,20 +175,103 @@ class TypedListSegment //! prevent compiler from providing a default constructor TypedListSegment() = delete; +/* + * The following two constructors allow users to specify a camp resource + * for each list segment, which will be used to manage segment index data. + * + * Eventually, I think it would be better to add a template parameter for + * this class to specify the camp resource type rather than passing in a + * resource object. + */ + /// + /// \brief Construct list segment from given array with specified length + /// and use given camp resource to allocate list segment index data + /// if owned by this list segment. + /// + /// By default the ctor performs a deep copy of array elements. + /// + /// If 'Unowned' is passed as last argument, the constructed object + /// does not own the segment data and will hold a pointer to given + /// array's data. In this case, caller must manage object lifetimes properly. + /// + TypedListSegment(const value_type* values, + Index_type length, + camp::resources::Resource& resource, + IndexOwnership owned = Owned) + : m_resource(resource), m_use_resource(true) + { + initIndexData(m_use_resource, + values, length, owned); + } + + /// + /// Construct list segment from arbitrary object holding + /// indices using a deep copy of given data. + /// + /// The object must provide methods: begin(), end(), size(). + /// + template + TypedListSegment(const Container& container, + camp::resources::Resource& resource) + : m_resource(resource), m_use_resource(true), + m_owned(Unowned), m_data(nullptr), m_size(container.size()) + { + + if (m_size > 0) { + + camp::resources::Resource host_res{camp::resources::Host()}; + + value_type* tmp = host_res.allocate(m_size); + + auto dest = tmp; + auto src = container.begin(); + auto const end = container.end(); + while (src != end) { + *dest = *src; + ++dest; + ++src; + } + + m_data = m_resource.allocate(m_size); + m_resource.memcpy(m_data, tmp, sizeof(value_type) * m_size); + m_owned = Owned; + + host_res.deallocate(tmp); + + } + } + + +/* + * The following two ctors preserve the original list segment behavior for + * CUDA and HIP device memory management. + * + * Note that the host resource object created in the member initialization + * list is not used. Whether the memory management routines use the original + * approach or camp resources is controlled by the m_use_resource + * boolean member. + */ + /// /// \brief Construct list segment from given array with specified length. /// /// By default the ctor performs deep copy of array elements. + /// /// If 'Unowned' is passed as last argument, the constructed object - /// does not own the segment data and will hold a pointer to given data. - /// In this case, caller must manage object lifetimes properly. + /// does not own the segment data and will hold a pointer to given + /// array's data.
In this case, caller must manage object lifetimes properly. /// + RAJA_DEPRECATE("In next RAJA release, TypedListSegment ctor will require a camp Resource object") TypedListSegment(const value_type* values, Index_type length, IndexOwnership owned = Owned) + : m_resource(camp::resources::Resource{camp::resources::Host()}), + m_use_resource(false), + m_owned(Unowned), m_data(nullptr), m_size(0) { - // future TODO -- change to initializer list somehow - initIndexData(values, length, owned); + initIndexData(m_use_resource, + values, length, owned); } /// @@ -189,28 +281,36 @@ class TypedListSegment /// The object must provide methods: begin(), end(), size(). /// template + RAJA_DEPRECATE("In next RAJA release, TypedListSegment ctor will require a camp Resource object") explicit TypedListSegment(const Container& container) - : m_data(nullptr), m_size(container.size()), m_owned(Unowned) + : m_resource(camp::resources::Resource{camp::resources::Host()}), + m_use_resource(false), + m_owned(Unowned), m_data(nullptr), m_size(container.size()) { - if (m_size <= 0) return; - allocate_and_copy(container); - m_owned = Owned; + if (m_size > 0) { + allocate_and_copy(container); + m_owned = Owned; + } } /// /// Copy-constructor for list segment. /// TypedListSegment(const TypedListSegment& other) + : m_resource(other.m_resource), m_use_resource(other.m_use_resource), + m_owned(Unowned), m_data(nullptr), m_size(0) { - // future TODO: switch to member initialization list ... somehow - initIndexData(other.m_data, other.m_size, other.m_owned); + bool from_copy_ctor = true; + initIndexData(other.m_use_resource, + other.m_data, other.m_size, other.m_owned, from_copy_ctor); } /// /// Move-constructor for list segment. /// TypedListSegment(TypedListSegment&& rhs) - : m_data(rhs.m_data), m_size(rhs.m_size), m_owned(rhs.m_owned) + : m_resource(rhs.m_resource), m_use_resource(rhs.m_use_resource), + m_owned(rhs.m_owned), m_data(rhs.m_data), m_size(rhs.m_size) { // make the rhs non-owning so it's destructor won't have any side effects rhs.m_owned = Unowned; @@ -221,8 +321,15 @@ class TypedListSegment /// ~TypedListSegment() { - if (m_data == nullptr || m_owned != Owned) return; - deallocate(std::integral_constant()); + if (m_data != nullptr && m_owned == Owned) { + + if (m_use_resource) { + m_resource.deallocate(m_data); + } else { + deallocate(std::integral_constant()); + } + + } } @@ -231,6 +338,8 @@ class TypedListSegment /// RAJA_HOST_DEVICE void swap(TypedListSegment& other) { + camp::safe_swap(m_resource, other.m_resource); + camp::safe_swap(m_use_resource, other.m_use_resource); camp::safe_swap(m_data, other.m_data); camp::safe_swap(m_size, other.m_size); camp::safe_swap(m_owned, other.m_owned); @@ -241,6 +350,7 @@ class TypedListSegment //! accessor to get the begin iterator for a TypedListSegment RAJA_HOST_DEVICE iterator begin() const { return m_data; } + //! accessor to retrieve the total number of elements in a TypedListSegment RAJA_HOST_DEVICE Index_type size() const { return m_size; } @@ -281,34 +391,77 @@ class TypedListSegment // Initialize segment data properly based on whether object // owns the index data. 
// - void initIndexData(const value_type* container, + void initIndexData(bool use_resource, + const value_type* container, Index_type len, - IndexOwnership container_own) + IndexOwnership container_own, + bool from_copy_ctor = false) { - // empty + + // empty list segment if (len <= 0 || container == nullptr) { m_data = nullptr; m_size = 0; m_owned = Unowned; return; } - // some size -- initialize accordingly + + // some non-zero size -- initialize accordingly m_size = len; m_owned = container_own; if (m_owned == Owned) { - allocate_and_copy(RAJA::impl::make_span(container, len)); + + if (use_resource) { + + if ( from_copy_ctor ) { + + m_data = m_resource.allocate(m_size); + m_resource.memcpy(m_data, container, sizeof(value_type) * m_size); + + } else { + + camp::resources::Resource host_res{camp::resources::Host()}; + + value_type* tmp = host_res.allocate(m_size); + + for (Index_type i = 0; i < m_size; ++i) { + tmp[i] = container[i]; + } + + m_data = m_resource.allocate(m_size); + m_resource.memcpy(m_data, tmp, sizeof(value_type) * m_size); + + host_res.deallocate(tmp); + + } + + } else { + allocate_and_copy(RAJA::make_span(container, len)); + } + return; } + + // list segment accesses container data directly. // Uh-oh. Using evil const_cast.... m_data = const_cast(container); } - //! buffer storage for list data + + // Copy of camp resource passed to ctor + camp::resources::Resource m_resource; + + // Boolean indicating whether camp resource is used to manage index data + bool m_use_resource; + + // ownership flag to guide data copying/management + IndexOwnership m_owned; + + // buffer storage for list data value_type* RAJA_RESTRICT m_data; - //! size of list segment + + // size of list segment Index_type m_size; - //! ownership flag to guide data copying/management - IndexOwnership m_owned; }; //! alias for A TypedListSegment with storage type @Index_type diff --git a/include/RAJA/index/RangeSegment.hpp b/include/RAJA/index/RangeSegment.hpp index 39db675e89..df9e187f1b 100644 --- a/include/RAJA/index/RangeSegment.hpp +++ b/include/RAJA/index/RangeSegment.hpp @@ -69,8 +69,12 @@ namespace RAJA * ****************************************************************************** */ -template > + +template >> struct TypedRangeSegment { + + static_assert(std::is_signed::value, "TypedRangeSegment DiffT requires signed type."); + static_assert(!std::is_floating_point::value, "TypedRangeStrideSegment Type must be non floating point."); //! the underlying iterator type using iterator = Iterators::numeric_iterator; @@ -80,15 +84,17 @@ struct TypedRangeSegment { */ using value_type = StorageT; - using IndexType = StorageT; + using IndexType = DiffT; //! construct a TypedRangeSegment from a begin and end value /*! * \param[in] begin the starting value (inclusive) for the range * \param[in] end the ending value (exclusive) for the range */ - RAJA_HOST_DEVICE constexpr TypedRangeSegment(DiffT begin, DiffT end) - : m_begin(iterator{begin}), m_end(iterator{end}) + using StripStorageT = strip_index_type_t; + RAJA_HOST_DEVICE constexpr TypedRangeSegment(StripStorageT begin, StripStorageT end) + : m_begin(iterator(begin)), + m_end(begin > end ? m_begin : iterator(end)) { } @@ -145,18 +151,18 @@ struct TypedRangeSegment { /*! * \return the range (end - begin) of this Segment */ - RAJA_HOST_DEVICE RAJA_INLINE StorageT size() const { return m_end - m_begin; } + RAJA_HOST_DEVICE RAJA_INLINE DiffT size() const { return m_end - m_begin; } //! Create a slice of this instance as a new instance /*! 
* \return A new instance spanning *begin() + begin to *begin() + begin + * length */ - RAJA_HOST_DEVICE RAJA_INLINE TypedRangeSegment slice(DiffT begin, + RAJA_HOST_DEVICE RAJA_INLINE TypedRangeSegment slice(StorageT begin, DiffT length) const { - auto start = m_begin[0] + begin; - auto end = start + length > m_end[0] ? m_end[0] : start + length; + StorageT start = m_begin[0] + begin; + StorageT end = start + length > m_end[0] ? m_end[0] : start + length; return TypedRangeSegment{stripIndexType(start), stripIndexType(end)}; } @@ -172,6 +178,12 @@ struct TypedRangeSegment { return m_begin == o.m_begin && m_end == o.m_end; } + + RAJA_HOST_DEVICE RAJA_INLINE bool operator!=(TypedRangeSegment const& o) const + { + return !(operator==(o)); + } + private: //! member variable for begin iterator iterator m_begin; @@ -240,9 +252,12 @@ struct TypedRangeSegment { * ****************************************************************************** */ -template > +template >> struct TypedRangeStrideSegment { + static_assert(std::is_signed::value, "TypedRangeStrideSegment DiffT requires signed type."); + static_assert(!std::is_floating_point::value, "TypedRangeStrideSegment Type must be non floating point."); + //! the underlying iterator type using iterator = Iterators::strided_numeric_iterator; @@ -252,15 +267,16 @@ struct TypedRangeStrideSegment { */ using value_type = StorageT; - using IndexType = StorageT; + using IndexType = DiffT; //! construct a TypedRangeStrideSegment from a begin and end value /*! * \param[in] begin the starting value (inclusive) for the range * \param[in] end the ending value (exclusive) for the range * \param[in] stride the increment value for the iteration of the range */ - RAJA_HOST_DEVICE TypedRangeStrideSegment(DiffT begin, - DiffT end, + using StripStorageT = strip_index_type_t; + RAJA_HOST_DEVICE TypedRangeStrideSegment(StripStorageT begin, + StripStorageT end, DiffT stride) : m_begin(iterator(begin, stride)), m_end(iterator(end, stride)), @@ -272,6 +288,13 @@ struct TypedRangeStrideSegment { // (stride > 0 ? value_type{1} : value_type{-1})) / // static_cast(stride)) { + // clamp range when the end is unreachable from the beginning without + // wrapping + if (stride < 0 && end > begin) { + m_end = m_begin; + } else if (stride > 0 && end < begin) { + m_end = m_begin; + } // if m_size was initialized as negative, that indicates a zero iteration // space m_size = m_size < DiffT{0} ? DiffT{0} : m_size; @@ -344,12 +367,12 @@ struct TypedRangeStrideSegment { * \return A new instance spanning *begin() + begin * stride to *begin() + * (begin + length) * stride */ - RAJA_HOST_DEVICE TypedRangeStrideSegment slice(DiffT begin, + RAJA_HOST_DEVICE TypedRangeStrideSegment slice(StorageT begin, DiffT length) const { - auto stride = m_begin.get_stride(); - auto start = m_begin[0] + begin * stride; - auto end = start + stride * length; + StorageT stride = m_begin.get_stride(); + StorageT start = m_begin[0] + begin * stride; + StorageT end = start + stride * length; if (stride > 0) { end = end > m_end[0] ? 
m_end[0] : end; @@ -440,12 +463,14 @@ RAJA_HOST_DEVICE TypedRangeSegment make_range(BeginT&& begin, template > + typename Common = detail::common_type_t> RAJA_HOST_DEVICE TypedRangeStrideSegment make_strided_range( BeginT&& begin, EndT&& end, StrideT&& stride) { + static_assert(std::is_signed::value, "make_strided_segment : stride must be signed."); + static_assert(std::is_same, StrideT>::value, "make_stride_segment : stride and end must be of similar types."); return {begin, end, stride}; } diff --git a/include/RAJA/internal/Iterators.hpp b/include/RAJA/internal/Iterators.hpp index 2406bedfbe..541519f860 100644 --- a/include/RAJA/internal/Iterators.hpp +++ b/include/RAJA/internal/Iterators.hpp @@ -18,12 +18,15 @@ #ifndef RAJA_ITERATORS_HPP #define RAJA_ITERATORS_HPP -#include "RAJA/config.hpp" - #include +#include +#include #include +#include #include +#include "RAJA/config.hpp" +#include "RAJA/index/IndexValue.hpp" #include "RAJA/util/macros.hpp" #include "RAJA/util/types.hpp" @@ -34,6 +37,69 @@ namespace Iterators // Containers +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) +template +std::string overflow_msg(LType lhs, RType rhs) +{ + return "Iterator Overflow detected between operation of :\n\ttype : " + + (std::string) typeid(lhs).name() + " val : " + std::to_string(lhs) + + "\n\ttype : " + typeid(rhs).name() + " val : " + std::to_string(rhs) + + "\n"; +} + +template +RAJA_HOST_DEVICE bool is_addition_overflow(Type lhs, DifferenceType rhs) +{ + if (std::is_unsigned::value) { + if ((rhs > 0) && (lhs > std::numeric_limits::max() - rhs)) + return true; + if ((rhs < 0) && (lhs < std::numeric_limits::min() - rhs)) + return true; + } + return false; +} + +template +RAJA_HOST_DEVICE bool is_subtraction_overflow(Type lhs, + DifferenceType rhs, + bool iterator_on_left = true) +{ + if (iterator_on_left) { + + if (std::is_unsigned::value) { + if ((rhs > 0) && (lhs < std::numeric_limits::min() + rhs)) + return true; + if ((rhs < 0) && (lhs > std::numeric_limits::max() + rhs)) + return true; + } + + } else { // Special case where operation is : value(lhs) - iterator(rhs). 
+ + if (std::is_unsigned::value) { + if ((lhs > 0) && (rhs < std::numeric_limits::min() + lhs)) + return true; + if ((lhs < 0)) return true; + } + } + return false; +} + +template +RAJA_HOST_DEVICE void check_is_addition_overflow(Type lhs, DifferenceType rhs) +{ + if (is_addition_overflow(lhs, rhs)) + throw std::runtime_error(overflow_msg(lhs, rhs)); +} + +template +RAJA_HOST_DEVICE void check_is_subtraction_overflow(Type lhs, + DifferenceType rhs) +{ + if (is_subtraction_overflow(lhs, rhs)) + throw std::runtime_error(overflow_msg(lhs, rhs)); +} +#endif + template @@ -41,20 +107,35 @@ class numeric_iterator { public: using value_type = Type; + using stripped_value_type = strip_index_type_t; using difference_type = DifferenceType; using pointer = PointerType; using reference = value_type&; using iterator_category = std::random_access_iterator_tag; - RAJA_HOST_DEVICE constexpr numeric_iterator() : val(0) {} - RAJA_HOST_DEVICE constexpr numeric_iterator(const difference_type& rhs) - : val(rhs) + RAJA_HOST_DEVICE constexpr numeric_iterator() {} + RAJA_HOST_DEVICE constexpr numeric_iterator(const numeric_iterator& rhs) + : val(rhs.val) { } - RAJA_HOST_DEVICE constexpr numeric_iterator(const numeric_iterator& rhs) + RAJA_HOST_DEVICE constexpr numeric_iterator(numeric_iterator&& rhs) : val(rhs.val) { } + RAJA_HOST_DEVICE numeric_iterator& operator=(const numeric_iterator& rhs) + { + val = rhs.val; + return *this; + } + RAJA_HOST_DEVICE numeric_iterator& operator=(numeric_iterator&& rhs) + { + val = rhs.val; + return *this; + } + RAJA_HOST_DEVICE constexpr numeric_iterator(const stripped_value_type& rhs) + : val(rhs) + { + } RAJA_HOST_DEVICE inline DifferenceType get_stride() const { return 1; } @@ -109,12 +190,18 @@ class numeric_iterator RAJA_HOST_DEVICE inline numeric_iterator& operator+=( const difference_type& rhs) { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_addition_overflow(val, rhs); +#endif val += rhs; return *this; } RAJA_HOST_DEVICE inline numeric_iterator& operator-=( const difference_type& rhs) { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_subtraction_overflow(val, rhs); +#endif val -= rhs; return *this; } @@ -131,12 +218,12 @@ class numeric_iterator return *this; } - RAJA_HOST_DEVICE inline difference_type operator+( + RAJA_HOST_DEVICE inline stripped_value_type operator+( const numeric_iterator& rhs) const { return val + rhs.val; } - RAJA_HOST_DEVICE inline difference_type operator-( + RAJA_HOST_DEVICE inline stripped_value_type operator-( const numeric_iterator& rhs) const { return val - rhs.val; @@ -144,24 +231,42 @@ class numeric_iterator RAJA_HOST_DEVICE inline numeric_iterator operator+( const difference_type& rhs) const { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_addition_overflow(val, rhs); +#endif return numeric_iterator(val + rhs); } RAJA_HOST_DEVICE inline numeric_iterator operator-( const difference_type& rhs) const { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_subtraction_overflow(val, rhs); +#endif return numeric_iterator(val - rhs); } RAJA_HOST_DEVICE friend constexpr numeric_iterator operator+( difference_type lhs, const numeric_iterator& rhs) { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + return is_addition_overflow(rhs.val, lhs) + ? 
throw std::runtime_error(overflow_msg(lhs, rhs.val)) + : numeric_iterator(lhs + rhs.val); +#else return numeric_iterator(lhs + rhs.val); +#endif } RAJA_HOST_DEVICE friend constexpr numeric_iterator operator-( difference_type lhs, const numeric_iterator& rhs) { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + return is_subtraction_overflow(rhs.val, lhs, false) + ? throw std::runtime_error(overflow_msg(lhs, rhs.val)) + : numeric_iterator(lhs - rhs.val); +#else return numeric_iterator(lhs - rhs.val); +#endif } RAJA_HOST_DEVICE inline value_type operator*() const @@ -178,7 +283,7 @@ class numeric_iterator } private: - difference_type val; + stripped_value_type val = 0; }; template ; using difference_type = DifferenceType; using pointer = DifferenceType*; using reference = DifferenceType&; using iterator_category = std::random_access_iterator_tag; - RAJA_HOST_DEVICE constexpr strided_numeric_iterator() : val(0), stride(1) {} - + RAJA_HOST_DEVICE constexpr strided_numeric_iterator() {} RAJA_HOST_DEVICE constexpr strided_numeric_iterator( - DifferenceType rhs, - DifferenceType stride_ = DifferenceType(1)) - : val(rhs), stride(stride_) + const strided_numeric_iterator& rhs) + : val(rhs.val), stride(rhs.stride) + { + } + RAJA_HOST_DEVICE constexpr strided_numeric_iterator(strided_numeric_iterator&& rhs) + : val(rhs.val), stride(rhs.stride) + { + } + RAJA_HOST_DEVICE strided_numeric_iterator& operator=( + const strided_numeric_iterator& rhs) + { + val = rhs.val; + stride = rhs.stride; + return *this; + } + RAJA_HOST_DEVICE strided_numeric_iterator& operator=( + strided_numeric_iterator&& rhs) { + val = rhs.val; + stride = rhs.stride; + return *this; } RAJA_HOST_DEVICE constexpr strided_numeric_iterator( - const strided_numeric_iterator& rhs) - : val(rhs.val), stride(rhs.stride) + stripped_value_type rhs, + DifferenceType stride_ = DifferenceType(1)) + : val(rhs), stride(stride_) { } @@ -224,12 +347,18 @@ class strided_numeric_iterator RAJA_HOST_DEVICE inline strided_numeric_iterator& operator+=( const difference_type& rhs) { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_addition_overflow(val, rhs * stride); +#endif val += rhs * stride; return *this; } RAJA_HOST_DEVICE inline strided_numeric_iterator& operator-=( const difference_type& rhs) { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_subtraction_overflow(val, rhs * stride); +#endif val -= rhs * stride; return *this; } @@ -254,11 +383,17 @@ class strided_numeric_iterator RAJA_HOST_DEVICE inline strided_numeric_iterator operator+( const difference_type& rhs) const { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_addition_overflow(val, rhs * stride); +#endif return strided_numeric_iterator(val + rhs * stride, stride); } RAJA_HOST_DEVICE inline strided_numeric_iterator operator-( const difference_type& rhs) const { +#if defined(ENABLE_ITERATOR_OVERFLOW_DEBUG) + check_is_subtraction_overflow(val, rhs * stride); +#endif return strided_numeric_iterator(val - rhs * stride, stride); } @@ -311,8 +446,8 @@ class strided_numeric_iterator } private: - DifferenceType val; - DifferenceType stride; + stripped_value_type val = 0; + DifferenceType stride = 1; }; diff --git a/include/RAJA/internal/LegacyCompatibility.hpp b/include/RAJA/internal/LegacyCompatibility.hpp deleted file mode 100644 index 73954eb734..0000000000 --- a/include/RAJA/internal/LegacyCompatibility.hpp +++ /dev/null @@ -1,315 +0,0 @@ -/*! 
- ****************************************************************************** - * - * \file - * - * \brief Header file with support for pre-C++14 compilers. - * - ****************************************************************************** - */ - -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-20, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#ifndef RAJA_LEGACY_COMPATIBILITY_HPP -#define RAJA_LEGACY_COMPATIBILITY_HPP - -#include "RAJA/config.hpp" - -#include -#include -#include -#include -#include - -#include "camp/camp.hpp" - -#include "RAJA/util/macros.hpp" - -#if (!defined(__INTEL_COMPILER)) && (!defined(RAJA_COMPILER_MSVC)) -static_assert(__cplusplus >= 201103L, - "C++ standards below 2011 are not " - "supported" RAJA_STRINGIFY_HELPER(__cplusplus)); -#endif - -#if __cplusplus > 201400L -#define RAJA_CXX14_CONSTEXPR constexpr -#else -#define RAJA_CXX14_CONSTEXPR -#endif - -// #if defined(RAJA_USE_CUDA) -// #include -// namespace VarOps { -// using thrust::tuple; -// using thrust::tuple_element; -// using thrust::get; -// using thrust::tuple_size; -// using thrust::make_tuple; -// } -// #else -#include -#include -namespace VarOps -{ -using std::get; -using std::make_tuple; -using std::tuple; -using std::tuple_cat; -using std::tuple_element; -using std::tuple_size; -} // namespace VarOps -// #endif - -namespace VarOps -{ - -// Basics, using c++14 semantics in a c++11 compatible way, credit to libc++ - -// Forward - -// FoldL -template -struct foldl_impl; - -template -struct foldl_impl { - using Ret = Arg1; -}; - -template -struct foldl_impl { - using Ret = typename std::result_of::type; -}; - -template -struct foldl_impl { - using Ret = typename foldl_impl< - Op, - typename std::result_of::type, - Arg3)>::type, - Rest...>::Ret; -}; - -template -RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl( - Op&& RAJA_UNUSED_ARG(operation), - Arg1&& arg) -> typename foldl_impl::Ret -{ - return camp::forward(arg); -} - -template -RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation, - Arg1&& arg1, - Arg2&& arg2) -> - typename foldl_impl::Ret -{ - return camp::forward(operation)(camp::forward(arg1), - camp::forward(arg2)); -} - -template -RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation, - Arg1&& arg1, - Arg2&& arg2, - Arg3&& arg3, - Rest&&... rest) -> - typename foldl_impl::Ret -{ - return foldl(camp::forward(operation), - camp::forward(operation)( - camp::forward(operation)(camp::forward(arg1), - camp::forward(arg2)), - camp::forward(arg3)), - camp::forward(rest)...); -} - - -// Convenience folds -template -RAJA_HOST_DEVICE RAJA_INLINE constexpr Result sum(Args... args) -{ - return foldl(RAJA::operators::plus(), args...); -} - -template -RAJA_HOST_DEVICE RAJA_INLINE constexpr Result max(Args... args) -{ - return foldl(RAJA::operators::maximum(), args...); -} - -template -RAJA_HOST_DEVICE RAJA_INLINE constexpr Result min(Args... 
args) -{ - return foldl(RAJA::operators::minimum(), args...); -} - -// template -// struct product_first_n; -// -// template -// struct product_first_n{ -// static Result value = 1; -// template -// constexpr product_first_n(Args...args) : value{1} { } -// }; -// -// template -// struct product_first_n{ -// static Result value = product_first_n(args...)::value; -// template -// constexpr product_first_n(FirstArg arg1, Args...args) -// : value() { } -// }; - -template