diff --git a/include/RAJA/pattern/launch/launch_context_policy.hpp b/include/RAJA/pattern/launch/launch_context_policy.hpp new file mode 100644 index 0000000000..acf707615d --- /dev/null +++ b/include/RAJA/pattern/launch/launch_context_policy.hpp @@ -0,0 +1,124 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file containing template types of RAJA::LaunchContextT + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_pattern_context_policy_HPP +#define RAJA_pattern_context_policy_HPP + +namespace RAJA +{ + +template +class LaunchContextT; + +class LaunchContextDefaultPolicy; + +#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) +class LaunchContextDim3Policy; +#endif + +namespace detail +{ + + +template +struct has_single_call_operator : std::false_type +{}; + +template +struct has_single_call_operator< + T, + std::enable_if_t< + !std::is_same_v::operator()), void>>> + : std::true_type +{}; + +template +struct function_traits +{}; + +template +struct function_traits +{ + using result_type = R; + static constexpr std::size_t arity = sizeof...(Args); + + template + struct arg + { + static_assert(N < arity, "argument index out of range"); + using type = typename std::tuple_element>::type; + }; +}; + +template +struct function_traits : function_traits +{}; + +template +struct function_traits : function_traits +{}; + +template +struct function_traits : function_traits +{ + using functional_type = C; +}; + +template +struct function_traits : function_traits +{ + using functional_type = 
C; +}; + +template>::value> +struct functional_traits : function_traits> +{}; + +template +struct functional_traits + : function_traits::operator())> +{}; + +template +struct has_arg0 : std::false_type +{}; + +template +struct has_arg0::template arg<0>::type, + void>>> : std::true_type +{}; + +template::value> +struct launch_context_type +{ + using type = LaunchContextT; +}; + +template +struct launch_context_type +{ + using type = typename functional_traits::template arg<0>::type; +}; + + +} // namespace detail + +} // namespace RAJA +#endif diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 51df43471a..39739556cc 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -22,6 +22,7 @@ #include "RAJA/config.hpp" #include "RAJA/internal/get_platform.hpp" +#include "RAJA/pattern/launch/launch_context_policy.hpp" #include "RAJA/util/StaticLayout.hpp" #include "RAJA/util/macros.hpp" #include "RAJA/util/plugins.hpp" @@ -178,13 +179,12 @@ struct LaunchParams Threads apply(Threads const& a) { return (threads = a); } }; -class LaunchContext +class LaunchContextBase { public: // Bump style allocator used to // get memory from the pool size_t shared_mem_offset; - void* shared_mem_ptr; #if defined(RAJA_SYCL_ACTIVE) @@ -192,7 +192,7 @@ class LaunchContext mutable ::sycl::nd_item<3>* itm; #endif - RAJA_HOST_DEVICE LaunchContext() + RAJA_HOST_DEVICE LaunchContextBase() : shared_mem_offset(0), shared_mem_ptr(nullptr) {} @@ -211,20 +211,6 @@ class LaunchContext return static_cast(mem_ptr); } - /* - //Odd dependecy with atomics is breaking CI builds - template RAJA_HOST_DEVICE auto - getSharedMemoryView(size_t bytes, arg idx, args... 
idxs) - { - T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset]; - - shared_mem_offset += bytes*sizeof(T); - return RAJA::View>(mem_ptr, idx, - idxs...); - } - */ - RAJA_HOST_DEVICE void releaseSharedMemory() { // On the cpu/gpu we want to restart the count @@ -245,6 +231,39 @@ class LaunchContext } }; +template<> +class LaunchContextT : public LaunchContextBase +{ +public: + static constexpr bool hasDim3 = false; + + using LaunchContextBase::LaunchContextBase; +}; + +// Preserve backwards compatibility +using LaunchContext = LaunchContextT; + +#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) +template<> +class LaunchContextT : public LaunchContextBase +{ +public: + static constexpr bool hasDim3 = true; + + dim3 thread_id; + dim3 block_dim; + + LaunchContextT() : LaunchContextBase(), thread_id(), block_dim() {} + + RAJA_DEVICE + LaunchContextT(dim3 thread, dim3 block) + : LaunchContextBase(), + thread_id(thread), + block_dim(block) + {} +}; +#endif + template struct LaunchExecute; diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index a38ba15c89..aac23473ec 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -35,17 +35,28 @@ __global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx; - using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); auto& body = privatizer.get_priv(); // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; - RAJA::expt::invoke_body(reduce_params, body, ctx); + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + + if constexpr (LaunchContextType::hasDim3) + { + LaunchContextType ctx(threadIdx, blockDim); + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } + else + { + LaunchContextType ctx; + ctx.shared_mem_ptr = 
raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } // Using a flatten global policy as we may use all dimensions RAJA::expt::ParamMultiplexer::parampack_combine( @@ -142,7 +153,6 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ body_in, ReduceParams reduce_params) { - LaunchContext ctx; using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -150,9 +160,22 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; - RAJA::expt::invoke_body(reduce_params, body, ctx); + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + + if constexpr (LaunchContextType::hasDim3) + { + LaunchContextType ctx(threadIdx, blockDim); + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } + else + { + LaunchContextType ctx; + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } // Using a flatten global policy as we may use all dimensions RAJA::expt::ParamMultiplexer::parampack_combine( @@ -241,6 +264,65 @@ struct LaunchExecute< } }; +/* + Loop methods which rely on a copy of threadIdx/blockDim + for performance. In collaboration with AMD we have found this + to be more performant. 
+*/ + +namespace expt +{ + +template +struct cuda_ctx_thread_loop; + +using cuda_ctx_thread_loop_x = cuda_ctx_thread_loop; +using cuda_ctx_thread_loop_y = cuda_ctx_thread_loop; +using cuda_ctx_thread_loop_z = cuda_ctx_thread_loop; + +template +RAJA_INLINE RAJA_DEVICE int get_dim(Dim3Like const& d) +{ + if constexpr (DIM == named_dim::x) + { + return d.x; + } + else if constexpr (DIM == named_dim::y) + { + return d.y; + } + else + { + static_assert(DIM == named_dim::z, "Unsupported named_dim"); + return d.z; + } +} + +} // namespace expt + +template +struct LoopExecute, SEGMENT> +{ + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContextT const& ctx, + SEGMENT const& segment, + BODY const& body) + { + const int len = segment.end() - segment.begin(); + constexpr int int_dim = static_cast(DIM); + + const int thread_idx = expt::get_dim(ctx.thread_id); + const int stride = expt::get_dim(ctx.block_dim); + + for (int i = thread_idx; i < len; i += stride) + { + body(*(segment.begin() + i)); + } + } +}; + /* CUDA generic loop implementations */ @@ -337,9 +419,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -365,9 +447,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -401,9 +483,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + 
LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -437,9 +519,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -467,9 +549,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -512,9 +594,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -563,9 +645,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -587,9 +669,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -617,9 +699,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename 
SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -645,9 +727,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -673,9 +755,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -709,9 +791,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -745,9 +827,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -775,9 +857,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT 
const& segment1, BODY const& body) @@ -820,9 +902,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -884,9 +966,9 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -916,9 +998,9 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -958,9 +1040,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -995,9 +1077,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1046,9 +1128,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1083,9 +1165,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< 
typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1121,9 +1203,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1147,9 +1229,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1182,9 +1264,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1217,9 +1299,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1247,9 +1329,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ 
-1288,9 +1370,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1331,9 +1413,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1364,9 +1446,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1414,9 +1496,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1472,9 +1554,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1498,9 +1580,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + 
LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1534,9 +1616,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1570,9 +1652,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1600,9 +1682,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1642,9 +1724,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1686,9 +1768,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1719,9 +1801,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename 
SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1773,9 +1855,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 14ce600341..f52d69bce6 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -34,7 +34,6 @@ template __global__ void launch_new_reduce_global_fcn(const BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx; using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -42,9 +41,22 @@ __global__ void launch_new_reduce_global_fcn(const BODY body_in, // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; - RAJA::expt::invoke_body(reduce_params, body, ctx); + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + + if constexpr (LaunchContextType::hasDim3) + { + LaunchContextType ctx(threadIdx, blockDim); + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } + else + { + LaunchContextType ctx; + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } // Using a flatten global policy as we may use all dimensions RAJA::expt::ParamMultiplexer::parampack_combine( @@ -136,7 +148,6 @@ __launch_bounds__(num_threads, 1) __global__ void launch_new_reduce_global_fcn_fixed(const BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx; using 
RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -144,9 +155,22 @@ __launch_bounds__(num_threads, 1) __global__ // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; - RAJA::expt::invoke_body(reduce_params, body, ctx); + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + + if constexpr (LaunchContextType::hasDim3) + { + LaunchContextType ctx(threadIdx, blockDim); + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } + else + { + LaunchContextType ctx; + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } // Using a flatten global policy as we may use all dimensions RAJA::expt::ParamMultiplexer::parampack_combine( @@ -235,6 +259,70 @@ struct LaunchExecute> } }; +/* + Loop methods which rely on a copy of threadIdx/blockDim + for performance. In collaboration with AMD we have found this + to be more performant. 
+*/ + +namespace expt +{ + +template +struct hip_ctx_thread_loop; + +using hip_ctx_thread_loop_x = hip_ctx_thread_loop; +using hip_ctx_thread_loop_y = hip_ctx_thread_loop; +using hip_ctx_thread_loop_z = hip_ctx_thread_loop; + +template +RAJA_INLINE RAJA_DEVICE int get_dim(Dim3Like const& d) +{ + if constexpr (DIM == named_dim::x) + { + return d.x; + } + else if constexpr (DIM == named_dim::y) + { + return d.y; + } + else + { + static_assert(DIM == named_dim::z, "Unsupported named_dim"); + return d.z; + } +} + +} // namespace expt + +/* + Loop exec methods will have to be reworked to be hasDim3 aware +*/ + +template +struct LoopExecute, SEGMENT> +{ + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContextT const& ctx, + SEGMENT const& segment, + BODY const& body) + { + + const int len = segment.end() - segment.begin(); + constexpr int int_dim = static_cast(DIM); + + const int thread_idx = expt::get_dim(ctx.thread_id); + const int stride = expt::get_dim(ctx.block_dim); + + for (int i = thread_idx; i < len; i += stride) + { + body(*(segment.begin() + i)); + } + } +}; + /* HIP generic loop implementations */ @@ -249,9 +337,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -273,9 +361,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -303,9 +391,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE 
RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -331,9 +419,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -359,9 +447,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -395,9 +483,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -431,9 +519,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -461,9 +549,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -506,9 +594,9 @@ struct LoopExecute< using diff_t = typename 
std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -557,9 +645,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -581,9 +669,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -611,9 +699,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -639,9 +727,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -667,9 +755,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), 
SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -703,9 +791,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -739,9 +827,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -769,9 +857,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -814,9 +902,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -878,9 +966,9 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -910,9 +998,9 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY 
const& body) { @@ -952,9 +1040,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -989,9 +1077,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1040,9 +1128,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1077,9 +1165,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1115,9 +1203,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1141,9 +1229,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T 
tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1176,9 +1264,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1211,9 +1299,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1241,9 +1329,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1282,9 +1370,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1325,9 +1413,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1358,9 +1446,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - 
LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1408,9 +1496,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1466,9 +1554,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1492,9 +1580,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1528,9 +1616,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1564,9 +1652,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1594,9 +1682,9 @@ struct TileTCountExecute< using diff_t = typename 
std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1636,9 +1724,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1680,9 +1768,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1713,9 +1801,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1767,9 +1855,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp index 18e0cde171..ab877e3e9b 100644 --- a/include/RAJA/policy/openmp/launch.hpp +++ b/include/RAJA/policy/openmp/launch.hpp @@ -30,7 +30,6 @@ template<> struct LaunchExecute { - template static 
concepts::enable_if_t< resources::EventProxy, @@ -47,11 +46,15 @@ struct LaunchExecute EXEC_POL pol {}; using BodyType = decltype(thread_privatize(body)); + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + auto parallel_section = [&](ReduceParams& f_params, auto func) { - LaunchContext ctx; + LaunchContextType ctx; + auto loop_body = thread_privatize(body); static_assert(std::is_invocable::value, + LaunchContextType&>::value, "Internal RAJA error: Check the parallel kernel passed to " "OpenMP Parallel section in openmp/launch.hpp"); @@ -76,7 +79,7 @@ struct LaunchExecute // pragma so that the reduction parameter pack it operates on is the // version tracked by the combine OpenMP syntax auto parallel_kernel = [&](ReduceParams& f_params, BodyType& body, - LaunchContext& ctx) { + LaunchContextType& ctx) { expt::invoke_body(f_params, body.get_priv(), ctx); }; parallel_section(f_params, parallel_kernel); @@ -86,7 +89,7 @@ struct LaunchExecute { RAJA::region([&]() { auto parallel_kernel = [&](ReduceParams&, BodyType& body, - LaunchContext& ctx) { + LaunchContextType& ctx) { body.get_priv()(ctx); }; parallel_section(f_params, parallel_kernel); @@ -103,9 +106,9 @@ template struct LoopExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -123,9 +126,9 @@ struct LoopExecute }); } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -151,9 +154,9 @@ struct LoopExecute }); } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ 
-189,9 +192,9 @@ template struct LoopExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -205,9 +208,9 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -227,9 +230,9 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -262,9 +265,9 @@ template struct LoopICountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -278,9 +281,9 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -300,9 +303,9 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -335,9 +338,9 @@ template struct LoopExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -363,9 +366,9 @@ struct LoopExecute }); } - template + template static RAJA_INLINE 
RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -402,9 +405,9 @@ template struct LoopICountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -430,9 +433,9 @@ struct LoopICountExecute }); } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -468,9 +471,9 @@ template struct TileExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -495,9 +498,9 @@ template struct TileTCountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -524,9 +527,9 @@ template struct TileExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -545,9 +548,9 @@ template struct TileTCountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp index c63292dd2b..bb3e0913e4 100644 --- 
a/include/RAJA/policy/sequential/launch.hpp +++ b/include/RAJA/policy/sequential/launch.hpp @@ -59,7 +59,10 @@ struct LaunchExecute expt::ParamMultiplexer::parampack_init(pol, launch_reducers); } - LaunchContext ctx; + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + + LaunchContextType ctx; char* kernel_local_mem = new char[launch_params.shared_mem_size]; ctx.shared_mem_ptr = kernel_local_mem; @@ -101,9 +104,9 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -115,9 +118,9 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -137,9 +140,9 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -169,9 +172,9 @@ template struct LoopICountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -182,9 +185,9 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -204,9 +207,9 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, 
SEGMENT const& segment1, SEGMENT const& segment2, @@ -238,9 +241,9 @@ template struct TileExecute { - template + template static RAJA_HOST_DEVICE RAJA_INLINE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -259,9 +262,9 @@ template struct TileTCountExecute { - template + template static RAJA_HOST_DEVICE RAJA_INLINE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body)