From cd5065ee6eaeef0d372b3f5f12a7e8f51b360b35 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 25 Nov 2025 05:59:17 -0800 Subject: [PATCH 01/29] initial commit for launch loop optimization --- include/RAJA/pattern/launch/launch_core.hpp | 17 +++++++++++++++-- include/RAJA/policy/hip/launch.hpp | 4 +++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index bfce94057c..13fe663b61 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -185,15 +185,28 @@ class LaunchContext void* shared_mem_ptr; + const size_t thread_dim[3]; + const size_t block_dim[3]; + #if defined(RAJA_ENABLE_SYCL) mutable ::sycl::nd_item<3>* itm; #endif - RAJA_HOST_DEVICE LaunchContext() + RAJA_HOST_DEVICE LaunchContext() : shared_mem_offset(0), - shared_mem_ptr(nullptr) + shared_mem_ptr(nullptr), + thread_dim{1, 1, 1}, + block_dim{1, 1, 1} {} + RAJA_HOST_DEVICE LaunchContext(const size_t tx, const size_t ty, const size_t tz, + const size_t bx, const size_t by, const size_t bz) + : shared_mem_offset(0), + shared_mem_ptr(nullptr), + thread_dim{tx, ty, tz}, + block_dim{bx, by, bz} + {} + // TODO handle alignment template RAJA_HOST_DEVICE T* getSharedMemory(size_t bytes) diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index f3ae8f87c1..0800cc4591 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -32,7 +32,9 @@ template __global__ void launch_new_reduce_global_fcn(const BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx; + LaunchContext ctx(blockDim.x, blockDim.y, blockDim.z, + gridDim.x, gridDim.y, gridDim.z); + using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); From 484ff1a1e24eed231fc469530adf36549e8c4d22 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 25 Nov 2025 07:23:22 -0800 Subject: [PATCH 02/29] add structs to store gpu thread/block info in launch ctx --- include/RAJA/pattern/launch/launch_core.hpp | 6 +-- include/RAJA/policy/hip/launch.hpp | 42 +++++++++++++++++++-- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 13fe663b61..db4ca50f47 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -185,7 +185,7 @@ class LaunchContext void* shared_mem_ptr; - const size_t thread_dim[3]; + const size_t thread_id[3]; const size_t block_dim[3]; #if defined(RAJA_ENABLE_SYCL) @@ -195,7 +195,7 @@ class LaunchContext RAJA_HOST_DEVICE LaunchContext() : shared_mem_offset(0), shared_mem_ptr(nullptr), - thread_dim{1, 1, 1}, + thread_id{1, 1, 1}, block_dim{1, 1, 1} {} @@ -203,7 +203,7 @@ class LaunchContext const size_t bx, const size_t by, const size_t bz) : shared_mem_offset(0), shared_mem_ptr(nullptr), - thread_dim{tx, ty, tz}, + thread_id{tx, ty, tz}, block_dim{bx, by, bz} {} diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 0800cc4591..49c9a03546 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -32,9 +32,8 @@ template __global__ void launch_new_reduce_global_fcn(const BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx(blockDim.x, blockDim.y, blockDim.z, - gridDim.x, gridDim.y, gridDim.z); - + LaunchContext ctx(threadIdx.x, threadIdx.y, threadIdx.z, + blockDim.x, blockDim.y, blockDim.z); using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -139,7 +138,8 @@ __launch_bounds__(num_threads, 1) __global__ void launch_new_reduce_global_fcn_fixed(const BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx; + LaunchContext ctx(threadIdx.x, threadIdx.y, threadIdx.z, + blockDim.x, blockDim.y, blockDim.z); using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -241,6 +241,40 @@ struct LaunchExecute> } }; +template +struct hip_ctx_thread_loop; + +using hip_ctx_thread_loop_x = hip_ctx_thread_loop; +using hip_ctx_thread_loop_y = hip_ctx_thread_loop; +using hip_ctx_thread_loop_z = hip_ctx_thread_loop; + +template +struct LoopExecute, SEGMENT> +{ + + template + static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx, + SEGMENT const& segment, + BODY const& body) + { + + const int len = segment.end() - segment.begin(); + constexpr int int_dim = static_cast(DIM); + + //for(int i=::RAJA::internal::HipDimHelper::get(threadIdx); + for(int i = ctx.thread_id[int_dim]; + i < len; + i+=ctx.block_dim[int_dim]) + //i+=4) + { + body(*(segment.begin() + i)); + } + + } +}; + + + /* HIP generic loop implementations */ From 18f332b7b3ab7e6f395a60627ec8465a63e78182 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 2 Dec 2025 09:05:26 -0800 Subject: [PATCH 03/29] add cuda variant and add build guards for cpu --- include/RAJA/pattern/launch/launch_core.hpp | 23 +++++++------ include/RAJA/policy/cuda/launch.hpp | 38 +++++++++++++++++++-- include/RAJA/policy/hip/launch.hpp | 26 +++++++------- 3 files changed, 60 insertions(+), 27 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index db4ca50f47..4883204a9e 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -185,27 +185,28 @@ class LaunchContext void* shared_mem_ptr; - const size_t thread_id[3]; - const size_t block_dim[3]; +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + const dim3 thread_id; + const dim3 block_dim; +#endif #if defined(RAJA_ENABLE_SYCL) mutable ::sycl::nd_item<3>* itm; #endif - RAJA_HOST_DEVICE LaunchContext() + RAJA_HOST_DEVICE LaunchContext() : shared_mem_offset(0), - shared_mem_ptr(nullptr), - thread_id{1, 1, 1}, - block_dim{1, 1, 1} + shared_mem_ptr(nullptr) {} - RAJA_HOST_DEVICE LaunchContext(const size_t tx, const size_t ty, const size_t tz, - const size_t bx, const size_t by, const size_t bz) +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + RAJA_HOST_DEVICE LaunchContext(dim3 thread_id_, dim3 block_id_) : shared_mem_offset(0), shared_mem_ptr(nullptr), - thread_id{tx, ty, tz}, - block_dim{bx, by, bz} - {} + thread_id {thread_id_}, + block_dim {block_id_} + {} +#endif // TODO handle alignment template diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index d9cca09216..619a88305c 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -33,7 +33,7 @@ __global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx; + LaunchContext ctx(threadIdx, blockDim); using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -143,7 +143,7 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ body_in, ReduceParams reduce_params) { - LaunchContext ctx; + LaunchContext ctx(threadIdx, blockDim); using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -245,6 +245,40 @@ struct LaunchExecute< } }; +/* + Loop methods which rely on a copy of threaIdx/BlockDim + for performance. In collaboration with AMD we have have this + to be more performat. +*/ + +template +struct hip_ctx_thread_loop; + +using hip_ctx_thread_loop_x = hip_ctx_thread_loop; +using hip_ctx_thread_loop_y = hip_ctx_thread_loop; +using hip_ctx_thread_loop_z = hip_ctx_thread_loop; + +template +struct LoopExecute, SEGMENT> +{ + + template + static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx, + SEGMENT const& segment, + BODY const& body) + { + + const int len = segment.end() - segment.begin(); + constexpr int int_dim = static_cast(DIM); + + for (int i = ::RAJA::internal::HipDimHelper::get(ctx.thread_id); + i < len; i += ::RAJA::internal::HipDimHelper::get(ctx.block_dim)) + { + body(*(segment.begin() + i)); + } + } +}; + /* CUDA generic loop implementations */ diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 49c9a03546..39c3661439 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -32,8 +32,7 @@ template __global__ void launch_new_reduce_global_fcn(const BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx(threadIdx.x, threadIdx.y, threadIdx.z, - blockDim.x, blockDim.y, blockDim.z); + LaunchContext ctx(threadIdx, blockDim); using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -138,8 +137,7 @@ __launch_bounds__(num_threads, 1) __global__ void launch_new_reduce_global_fcn_fixed(const BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx(threadIdx.x, threadIdx.y, threadIdx.z, - blockDim.x, blockDim.y, blockDim.z); + LaunchContext ctx(threadIdx, blockDim); using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -241,6 +239,12 @@ struct LaunchExecute> } }; +/* + Loop methods which rely on a copy of threaIdx/BlockDim + for performance. In collaboration with AMD we have have this + to be more performant. +*/ + template struct hip_ctx_thread_loop; @@ -258,23 +262,17 @@ struct LoopExecute, SEGMENT> BODY const& body) { - const int len = segment.end() - segment.begin(); + const int len = segment.end() - segment.begin(); constexpr int int_dim = static_cast(DIM); - //for(int i=::RAJA::internal::HipDimHelper::get(threadIdx); - for(int i = ctx.thread_id[int_dim]; - i < len; - i+=ctx.block_dim[int_dim]) - //i+=4) + for (int i = ::RAJA::internal::HipDimHelper::get(ctx.thread_id); + i < len; i += ::RAJA::internal::HipDimHelper::get(ctx.block_dim)) { - body(*(segment.begin() + i)); + body(*(segment.begin() + i)); } - } }; - - /* HIP generic loop implementations */ From 73f224afe660743e3f1e16ed188470c556a47be6 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 11 Dec 2025 05:36:57 -0800 Subject: [PATCH 04/29] rework to support dim3 copy in ctx --- include/RAJA/pattern/launch/launch_core.hpp | 98 ++++++++- include/RAJA/policy/hip/launch.hpp | 210 ++++++++++---------- include/RAJA/policy/hip/policy.hpp | 2 +- 3 files changed, 204 insertions(+), 106 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 4883204a9e..9425c3c19d 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -176,37 +176,65 @@ struct LaunchParams Threads apply(Threads const& a) { return (threads = a); } }; -class LaunchContext +template +class LaunchContextT { public: + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + // If StoreDim3 is true, store by value; else, don't store + typename std::conditional::type thread_id; + typename std::conditional::type block_dim; +#endif + // Bump style allocator used to // get memory from the pool size_t shared_mem_offset; - void* shared_mem_ptr; +/* #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) const dim3 thread_id; const dim3 block_dim; #endif +*/ #if defined(RAJA_ENABLE_SYCL) mutable ::sycl::nd_item<3>* itm; #endif - RAJA_HOST_DEVICE LaunchContext() + RAJA_HOST_DEVICE LaunchContextT() : shared_mem_offset(0), shared_mem_ptr(nullptr) {} + // Only enable this constructor if StoreDim3 is true + template ::type = 0> + RAJA_HOST_DEVICE LaunchContextT(dim3 thread_id_, dim3 block_id_) + : shared_mem_offset(0), + shared_mem_ptr(nullptr), + thread_id(thread_id_), + block_dim(block_id_) + {} + + // Only enable this constructor if StoreDim3 is false + template ::type = 0> + RAJA_HOST_DEVICE LaunchContextT(dim3 thread_id_, dim3 block_id_) + : shared_mem_offset(0), + shared_mem_ptr(nullptr) + {} + +/* #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) - RAJA_HOST_DEVICE LaunchContext(dim3 thread_id_, dim3 block_id_) + RAJA_HOST_DEVICE LaunchContextT(dim3 thread_id_, dim3 block_id_) : shared_mem_offset(0), shared_mem_ptr(nullptr), thread_id {thread_id_}, block_dim {block_id_} {} #endif +*/ + // TODO handle alignment template @@ -255,6 +283,39 @@ class LaunchContext } }; +using LaunchContext = LaunchContextT; + +/* +template +class LaunchContextT : public LaunchContext +{ + +public: + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + // If StoreDim3 is true, store by value; else, don't store + typename std::conditional::type thread_id; + typename std::conditional::type block_dim; +#endif + + // Only enable this constructor if StoreDim3 is true + template ::type = 0> + RAJA_HOST_DEVICE LaunchContextT(dim3 thread_id_, dim3 block_id_) + : LaunchContext(), + thread_id(thread_id_), + block_dim(block_id_) + {} + + // Only enable this constructor if StoreDim3 is false + template ::type = 0> + RAJA_HOST_DEVICE LaunchContextT(dim3 thread_id_, dim3 block_id_) + : LaunchContext() + {} + +}; +*/ + + template struct LaunchExecute; @@ -478,6 +539,7 @@ struct LoopExecute; template struct LoopICountExecute; + RAJA_SUPPRESS_HD_WARN template, SEGMENT>::exec(ctx, segment, body); } + +/* +template +RAJA_HOST_DEVICE RAJA_INLINE void loop(LaunchContext const& ctx, SEGMENT const& segment, BODY const& body) +{ + LoopExecute, SEGMENT>::template exec(ctx, segment, body); +} +*/ + +/* +template +RAJA_HOST_DEVICE RAJA_INLINE void loop(LaunchContextT const& ctx, SEGMENT const& segment, BODY const& body) +{ + LoopExecute, SEGMENT>::template exec(ctx, segment, body); +} +*/ + + +/* +// Overload for other contexts +template +std::enable_if_t::value> +loop(CONTEXT const& ctx, SEGMENT const& segment, BODY const& body) +{ + LoopExecute, SEGMENT>::template exec(ctx, segment, body); +} +*/ + template +template __global__ void launch_new_reduce_global_fcn(const BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx(threadIdx, blockDim); + LaunchContextT ctx(threadIdx, blockDim); using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -49,9 +49,9 @@ __global__ void launch_new_reduce_global_fcn(const BODY body_in, RAJA::hip_flatten_global_xyz_direct {}, reduce_params); } -template +template struct LaunchExecute< - RAJA::policy::hip::hip_launch_t> + RAJA::policy::hip::hip_launch_t> { template @@ -69,7 +69,7 @@ struct LaunchExecute< EXEC_POL pol {}; auto func = reinterpret_cast( - &launch_new_reduce_global_fcn>); + &launch_new_reduce_global_fcn>); resources::Hip hip_res = res.get(); @@ -132,12 +132,13 @@ struct LaunchExecute< } }; -template +template __launch_bounds__(num_threads, 1) __global__ void launch_new_reduce_global_fcn_fixed(const BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx(threadIdx, blockDim); + + LaunchContextT ctx(threadIdx, blockDim); using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -152,10 +153,11 @@ __launch_bounds__(num_threads, 1) __global__ // Using a flatten global policy as we may use all dimensions RAJA::expt::ParamMultiplexer::parampack_combine( RAJA::hip_flatten_global_xyz_direct {}, reduce_params); + } -template -struct LaunchExecute> +template +struct LaunchExecute> { template @@ -175,7 +177,7 @@ struct LaunchExecute> EXEC_POL pol {}; auto func = reinterpret_cast( - &launch_new_reduce_global_fcn_fixed>); resources::Hip hip_res = res.get(); @@ -256,20 +258,26 @@ template struct LoopExecute, SEGMENT> { - template - static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx, +template + static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT const& ctx, SEGMENT const& segment, BODY const& body) { + //static_assert(hasDim3 == true, "Must use device policy that stored dim3 info"); + const int len = segment.end() - segment.begin(); constexpr int int_dim = static_cast(DIM); - for (int i = ::RAJA::internal::HipDimHelper::get(ctx.thread_id); - i < len; i += ::RAJA::internal::HipDimHelper::get(ctx.block_dim)) - { - body(*(segment.begin() + i)); - } + //if constexpr (hasDim3) { + + for (int i = ::RAJA::internal::HipDimHelper::get(ctx.thread_id); + i < len; i += ::RAJA::internal::HipDimHelper::get(ctx.block_dim)) + { + body(*(segment.begin() + i)); + } + //} + } }; @@ -287,9 +295,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template +template //need to finish static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -311,9 +319,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -341,9 +349,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -369,9 +377,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -397,9 +405,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -433,9 +441,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -469,9 +477,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -499,9 +507,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -544,9 +552,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -595,9 +603,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -619,9 +627,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -649,9 +657,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -677,9 +685,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -705,9 +713,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -741,9 +749,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -777,9 +785,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -807,9 +815,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -852,9 +860,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -916,9 +924,9 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -948,9 +956,9 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -990,9 +998,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1027,9 +1035,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1078,9 +1086,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1115,9 +1123,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1153,9 +1161,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1179,9 +1187,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1214,9 +1222,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1249,9 +1257,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1279,9 +1287,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1320,9 +1328,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1363,9 +1371,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1396,9 +1404,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1446,9 +1454,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1504,9 +1512,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1530,9 +1538,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1566,9 +1574,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1602,9 +1610,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1632,9 +1640,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1674,9 +1682,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1718,9 +1726,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1751,9 +1759,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1805,9 +1813,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index e7c3ca0672..5914b0cdfb 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -358,7 +358,7 @@ struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< using LaunchConcretizer = _LaunchConcretizer; }; -template +template struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::region, From 672889ee9e74d3e45f06bc74a05115fd69c88bf9 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 11 Dec 2025 07:07:53 -0800 Subject: [PATCH 05/29] make format --- include/RAJA/pattern/launch/launch_core.hpp | 44 +++++++++++---------- include/RAJA/policy/cuda/launch.hpp | 4 +- include/RAJA/policy/hip/launch.hpp | 29 +++++++------- include/RAJA/policy/hip/policy.hpp | 4 +- 4 files changed, 44 insertions(+), 37 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 9236c46de0..aec60c6de8 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -176,11 +176,10 @@ struct LaunchParams Threads apply(Threads const& a) { return (threads = a); } }; -template +template class LaunchContextT { public: - #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) // If StoreDim3 is true, store by value; else, don't store typename std::conditional::type thread_id; @@ -202,21 +201,23 @@ class LaunchContextT shared_mem_ptr(nullptr) {} +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) // Only enable this constructor if StoreDim3 is true - template ::type = 0> + template::type = 0> RAJA_HOST_DEVICE LaunchContextT(dim3 thread_id_, dim3 block_id_) - : shared_mem_offset(0), - shared_mem_ptr(nullptr), - thread_id(thread_id_), - block_dim(block_id_) + : shared_mem_offset(0), + shared_mem_ptr(nullptr), + thread_id(thread_id_), + block_dim(block_id_) {} // Only enable this constructor if StoreDim3 is false - template ::type = 0> + template::type = 0> RAJA_HOST_DEVICE LaunchContextT(dim3 thread_id_, dim3 block_id_) - : shared_mem_offset(0), - shared_mem_ptr(nullptr) + : shared_mem_offset(0), + shared_mem_ptr(nullptr) {} +#endif // TODO handle alignment template @@ -492,7 +493,6 @@ struct LoopExecute; template struct LoopICountExecute; - RAJA_SUPPRESS_HD_WARN template, SEGMENT>::exec(ctx, segment, body); } - /* template -RAJA_HOST_DEVICE RAJA_INLINE void loop(LaunchContext const& ctx, SEGMENT const& segment, BODY const& body) +RAJA_HOST_DEVICE RAJA_INLINE void loop(LaunchContext const& ctx, SEGMENT const& +segment, BODY const& body) { - LoopExecute, SEGMENT>::template exec(ctx, segment, body); + LoopExecute, SEGMENT>::template exec(ctx, +segment, body); } */ /* template -RAJA_HOST_DEVICE RAJA_INLINE void loop(LaunchContextT const& ctx, SEGMENT const& segment, BODY const& body) +RAJA_HOST_DEVICE RAJA_INLINE void loop(LaunchContextT const& ctx, SEGMENT +const& segment, BODY const& body) { - LoopExecute, SEGMENT>::template exec(ctx, segment, body); + LoopExecute, SEGMENT>::template exec(ctx, +segment, body); } */ /* // Overload for other contexts -template -std::enable_if_t::value> -loop(CONTEXT const& ctx, SEGMENT const& segment, BODY const& body) +template std::enable_if_t::value> loop(CONTEXT const& +ctx, SEGMENT const& segment, BODY const& body) { - LoopExecute, SEGMENT>::template exec(ctx, segment, body); + LoopExecute, SEGMENT>::template exec(ctx, +segment, body); } */ diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index 24c2cc8e62..7af04eddc8 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -261,10 +261,10 @@ using cuda_ctx_thread_loop_x = cuda_ctx_thread_loop; using cuda_ctx_thread_loop_y = cuda_ctx_thread_loop; using cuda_ctx_thread_loop_z = cuda_ctx_thread_loop; -} +} // namespace expt template -struct LoopExecute, SEGMENT> +struct LoopExecute, SEGMENT> { template diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index b59d342ae5..72ec4b3d6d 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -51,7 +51,7 @@ __global__ void launch_new_reduce_global_fcn(const BODY body_in, template struct LaunchExecute< - RAJA::policy::hip::hip_launch_t> + RAJA::policy::hip::hip_launch_t> { template @@ -69,7 +69,8 @@ struct LaunchExecute< EXEC_POL pol {}; auto func = reinterpret_cast( - &launch_new_reduce_global_fcn>); + &launch_new_reduce_global_fcn>); resources::Hip hip_res = res.get(); @@ -153,11 +154,11 @@ __launch_bounds__(num_threads, 1) __global__ // Using a flatten global policy as we may use all dimensions RAJA::expt::ParamMultiplexer::parampack_combine( RAJA::hip_flatten_global_xyz_direct {}, reduce_params); - } template -struct LaunchExecute> +struct LaunchExecute< + RAJA::policy::hip::hip_launch_t> { template @@ -177,7 +178,7 @@ struct LaunchExecute EXEC_POL pol {}; auto func = reinterpret_cast( - &launch_new_reduce_global_fcn_fixed>); resources::Hip hip_res = res.get(); @@ -257,7 +258,7 @@ using hip_ctx_thread_loop_x = hip_ctx_thread_loop; using hip_ctx_thread_loop_y = hip_ctx_thread_loop; using hip_ctx_thread_loop_z = hip_ctx_thread_loop; -} +} // namespace expt /* Loop exec methods will have to be reworked to be hasDim3 aware @@ -267,7 +268,7 @@ template struct LoopExecute, SEGMENT> { -template + template static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT const& ctx, SEGMENT const& segment, BODY const& body) @@ -276,11 +277,11 @@ template const int len = segment.end() - segment.begin(); constexpr int int_dim = static_cast(DIM); - for (int i = ::RAJA::internal::HipDimHelper::get(ctx.thread_id); - i < len; i += ::RAJA::internal::HipDimHelper::get(ctx.block_dim)) - { - body(*(segment.begin() + i)); - } + for (int i = ::RAJA::internal::HipDimHelper::get(ctx.thread_id); + i < len; i += ::RAJA::internal::HipDimHelper::get(ctx.block_dim)) + { + body(*(segment.begin() + i)); + } } }; @@ -298,7 +299,7 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; -template //need to finish + template // need to finish static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -482,7 +483,7 @@ struct LoopExecute< template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 5914b0cdfb..c9db8a21a1 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -358,7 +358,9 @@ struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< using LaunchConcretizer = _LaunchConcretizer; }; -template +template struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::region, From 5908a20c6b722b1b9386a3e23a09e6dfb1e9094a Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 11 Dec 2025 09:05:16 -0800 Subject: [PATCH 06/29] Update include/RAJA/pattern/launch/launch_core.hpp Co-authored-by: Jason Burmark --- include/RAJA/pattern/launch/launch_core.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index aec60c6de8..bcb797553d 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -203,7 +203,7 @@ class LaunchContextT #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) // Only enable this constructor if StoreDim3 is true - template::type = 0> + template* = nullptr> RAJA_HOST_DEVICE LaunchContextT(dim3 thread_id_, dim3 block_id_) : shared_mem_offset(0), shared_mem_ptr(nullptr), From 4d9f800ffb44abb299556b6c57d70c136e76125e Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 18 Dec 2025 06:08:34 -0800 Subject: [PATCH 07/29] clean up pass --- .../pattern/launch/launch_context_policy.hpp | 31 +++ include/RAJA/pattern/launch/launch_core.hpp | 86 +++---- include/RAJA/policy/hip/launch.hpp | 211 +++++++++--------- include/RAJA/policy/hip/policy.hpp | 3 +- 4 files changed, 188 insertions(+), 143 deletions(-) create mode 100644 include/RAJA/pattern/launch/launch_context_policy.hpp diff --git a/include/RAJA/pattern/launch/launch_context_policy.hpp b/include/RAJA/pattern/launch/launch_context_policy.hpp new file mode 100644 index 0000000000..c50a5484a8 --- /dev/null +++ b/include/RAJA/pattern/launch/launch_context_policy.hpp @@ -0,0 +1,31 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file containing the core components of RAJA::launch + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_pattern_context_policy_HPP +#define RAJA_pattern_context_policy_HPP + +namespace RAJA +{ + +class LaunchContextDefaultPolicy; + +#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) +class LaunchContextDim3Policy; +#endif + +} // namespace RAJA +#endif diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index aec60c6de8..b6a17a27a7 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -20,6 +20,7 @@ #include "RAJA/config.hpp" #include "RAJA/internal/get_platform.hpp" +#include "RAJA/pattern/launch/launch_context_policy.hpp" #include "RAJA/util/StaticLayout.hpp" #include "RAJA/util/macros.hpp" #include "RAJA/util/plugins.hpp" @@ -176,16 +177,9 @@ struct LaunchParams Threads apply(Threads const& a) { return (threads = a); } }; -template -class LaunchContextT +class LaunchContextBase { public: -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) - // If StoreDim3 is true, store by value; else, don't store - typename std::conditional::type thread_id; - typename std::conditional::type block_dim; -#endif - // Bump style allocator used to // get memory from the pool size_t shared_mem_offset; @@ -196,28 +190,10 @@ class LaunchContextT mutable ::sycl::nd_item<3>* itm; #endif - RAJA_HOST_DEVICE LaunchContextT() - : shared_mem_offset(0), - shared_mem_ptr(nullptr) - {} - -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) - // Only enable this constructor if StoreDim3 is true - template::type = 0> - RAJA_HOST_DEVICE LaunchContextT(dim3 thread_id_, dim3 block_id_) - : shared_mem_offset(0), - shared_mem_ptr(nullptr), - thread_id(thread_id_), - block_dim(block_id_) - {} - - // Only enable this constructor if StoreDim3 is false - template::type = 0> - RAJA_HOST_DEVICE LaunchContextT(dim3 thread_id_, dim3 block_id_) + RAJA_HOST_DEVICE LaunchContextBase() : shared_mem_offset(0), shared_mem_ptr(nullptr) {} -#endif // TODO handle alignment template @@ -233,20 +209,6 @@ class LaunchContextT return static_cast(mem_ptr); } - /* - //Odd dependecy with atomics is breaking CI builds - template RAJA_HOST_DEVICE auto - getSharedMemoryView(size_t bytes, arg idx, args... idxs) - { - T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset]; - - shared_mem_offset += bytes*sizeof(T); - return RAJA::View>(mem_ptr, idx, - idxs...); - } - */ - RAJA_HOST_DEVICE void releaseSharedMemory() { // On the cpu/gpu we want to restart the count @@ -267,8 +229,48 @@ class LaunchContextT } }; +template +class LaunchContextT; + +template<> +class LaunchContextT : public LaunchContextBase +{ +public: + +static constexpr bool hasDim3 = false; + +using LaunchContextBase::LaunchContextBase; + +}; + // Preserve backwards compatibility -using LaunchContext = LaunchContextT; +using LaunchContext = LaunchContextT; + +template <> +class LaunchContextT : public LaunchContextBase +{ +public: + + static constexpr bool hasDim3 = true; + + dim3 thread_id; + dim3 block_dim; + + RAJA_HOST_DEVICE + LaunchContextT() + : LaunchContextBase() + , thread_id() + , block_dim() + {} + + RAJA_HOST_DEVICE + LaunchContextT(dim3 thread, dim3 block) + : LaunchContextBase() + , thread_id(thread) + , block_dim(block) + {} + +}; template struct LaunchExecute; diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 72ec4b3d6d..0fdd75fe3c 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -28,11 +28,10 @@ namespace RAJA { -template +template __global__ void launch_new_reduce_global_fcn(const BODY body_in, ReduceParams reduce_params) { - LaunchContextT ctx(threadIdx, blockDim); using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -40,18 +39,25 @@ __global__ void launch_new_reduce_global_fcn(const BODY body_in, // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; - RAJA::expt::invoke_body(reduce_params, body, ctx); + if constexpr(LaunchContextT::hasDim3) { + LaunchContextT ctx(threadIdx, blockDim); + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } else { + LaunchContextT ctx; + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } // Using a flatten global policy as we may use all dimensions RAJA::expt::ParamMultiplexer::parampack_combine( RAJA::hip_flatten_global_xyz_direct {}, reduce_params); } -template +template struct LaunchExecute< - RAJA::policy::hip::hip_launch_t> + RAJA::policy::hip::hip_launch_t> { template @@ -69,7 +75,7 @@ struct LaunchExecute< EXEC_POL pol {}; auto func = reinterpret_cast( - &launch_new_reduce_global_fcn>); resources::Hip hip_res = res.get(); @@ -133,32 +139,37 @@ struct LaunchExecute< } }; -template +template __launch_bounds__(num_threads, 1) __global__ void launch_new_reduce_global_fcn_fixed(const BODY body_in, ReduceParams reduce_params) { - LaunchContextT ctx(threadIdx, blockDim); - using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); auto& body = privatizer.get_priv(); // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; - RAJA::expt::invoke_body(reduce_params, body, ctx); + if constexpr(LaunchContextT::hasDim3) { + LaunchContextT ctx(threadIdx, blockDim); + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } else { + LaunchContextT ctx; + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } // Using a flatten global policy as we may use all dimensions RAJA::expt::ParamMultiplexer::parampack_combine( RAJA::hip_flatten_global_xyz_direct {}, reduce_params); } -template +template struct LaunchExecute< - RAJA::policy::hip::hip_launch_t> + RAJA::policy::hip::hip_launch_t> { template @@ -178,7 +189,7 @@ struct LaunchExecute< EXEC_POL pol {}; auto func = reinterpret_cast( - &launch_new_reduce_global_fcn_fixed>); resources::Hip hip_res = res.get(); @@ -269,7 +280,7 @@ struct LoopExecute, SEGMENT> { template - static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT const& ctx, + static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT const& ctx, SEGMENT const& segment, BODY const& body) { @@ -299,9 +310,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template // need to finish + template // need to finish static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -323,9 +334,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -353,9 +364,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -381,9 +392,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -409,9 +420,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -445,9 +456,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -481,9 +492,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -511,9 +522,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -556,9 +567,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -607,9 +618,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -631,9 +642,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -661,9 +672,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -689,9 +700,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -717,9 +728,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -753,9 +764,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -789,9 +800,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -819,9 +830,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -864,9 +875,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -928,9 +939,9 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -960,9 +971,9 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1002,9 +1013,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1039,9 +1050,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1090,9 +1101,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1127,9 +1138,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1165,9 +1176,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1191,9 +1202,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1226,9 +1237,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1261,9 +1272,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1291,9 +1302,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1332,9 +1343,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1375,9 +1386,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1408,9 +1419,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1458,9 +1469,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1516,9 +1527,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1542,9 +1553,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1578,9 +1589,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1614,9 +1625,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1644,9 +1655,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1686,9 +1697,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1730,9 +1741,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1763,9 +1774,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1817,9 +1828,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContextT const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index c9db8a21a1..0a94af3d8d 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -26,6 +26,7 @@ #include "hip/hip_runtime.h" #include "RAJA/pattern/reduce.hpp" +#include "RAJA/pattern/launch/launch_context_policy.hpp" #include "RAJA/policy/PolicyBase.hpp" #include "RAJA/policy/sequential/policy.hpp" @@ -360,7 +361,7 @@ struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< template + typename LaunchContextPolicy = LaunchContextDefaultPolicy> struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::region, From 85aef5afb4ef5e7e877a5424d7a83561f23afe11 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 18 Dec 2025 06:23:00 -0800 Subject: [PATCH 08/29] fix build error --- include/RAJA/pattern/launch/launch_core.hpp | 34 ++------------------- scripts/radiuss-spack-configs | 2 +- scripts/uberenv | 2 +- 3 files changed, 4 insertions(+), 34 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index b6a17a27a7..a936d8d564 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -246,6 +246,7 @@ using LaunchContextBase::LaunchContextBase; // Preserve backwards compatibility using LaunchContext = LaunchContextT; +#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) template <> class LaunchContextT : public LaunchContextBase { @@ -271,6 +272,7 @@ class LaunchContextT : public LaunchContextBase {} }; +#endif template struct LaunchExecute; @@ -508,38 +510,6 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const& ctx, LoopExecute, SEGMENT>::exec(ctx, segment, body); } -/* -template -RAJA_HOST_DEVICE RAJA_INLINE void loop(LaunchContext const& ctx, SEGMENT const& -segment, BODY const& body) -{ - LoopExecute, SEGMENT>::template exec(ctx, -segment, body); -} -*/ - -/* -template -RAJA_HOST_DEVICE RAJA_INLINE void loop(LaunchContextT const& ctx, SEGMENT -const& segment, BODY const& body) -{ - LoopExecute, SEGMENT>::template exec(ctx, -segment, body); -} -*/ - - -/* -// Overload for other contexts -template std::enable_if_t::value> loop(CONTEXT const& -ctx, SEGMENT const& segment, BODY const& body) -{ - LoopExecute, SEGMENT>::template exec(ctx, -segment, body); -} -*/ - template Date: Thu, 18 Dec 2025 06:30:28 -0800 Subject: [PATCH 09/29] take develop submodule --- scripts/radiuss-spack-configs | 2 +- scripts/uberenv | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 9961a46864..fddc4f16ee 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 9961a46864f0e0b5f42b6f7c401dd3407e319b62 +Subproject commit fddc4f16ee987abc9c1c61879eaf8a2d6a8253d9 diff --git a/scripts/uberenv b/scripts/uberenv index eb18535868..bec05e20bf 160000 --- a/scripts/uberenv +++ b/scripts/uberenv @@ -1 +1 @@ -Subproject commit eb1853586821360613f1c6c415ebf65d370a47e7 +Subproject commit bec05e20bf2a1634d97ead358a9072c36f1fdcac From 4a695f27c28595385f81d95bea65f11d79031255 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 18 Dec 2025 06:44:00 -0800 Subject: [PATCH 10/29] cuda backend --- include/RAJA/policy/cuda/launch.hpp | 45 +++++++++++++++++++---------- include/RAJA/policy/cuda/policy.hpp | 10 +++++-- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index 7af04eddc8..a965c166be 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -28,33 +28,39 @@ namespace RAJA { -template +template __global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx(threadIdx, blockDim); - using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); auto& body = privatizer.get_priv(); // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; - RAJA::expt::invoke_body(reduce_params, body, ctx); + if constexpr(LaunchContextT::hasDim3) { + LaunchContextT ctx(threadIdx, blockDim); + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } else { + LaunchContextT ctx; + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } // Using a flatten global policy as we may use all dimensions RAJA::expt::ParamMultiplexer::parampack_combine( RAJA::cuda_flatten_global_xyz_direct {}, reduce_params); } -template +template struct LaunchExecute< RAJA::policy::cuda::cuda_launch_explicit_t> + named_usage::unspecified, + LaunchContextPolicy>> { template @@ -72,7 +78,7 @@ struct LaunchExecute< EXEC_POL pol {}; auto func = reinterpret_cast( - &launch_new_reduce_global_fcn>); + &launch_new_reduce_global_fcn>); resources::Cuda cuda_res = res.get(); @@ -137,13 +143,13 @@ struct LaunchExecute< template __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx(threadIdx, blockDim); using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -151,18 +157,25 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; + if constexpr(LaunchContextT::hasDim3) { + LaunchContextT ctx(threadIdx, blockDim); + ctx.shared_mem_ptr = raja_shmem_ptr; RAJA::expt::invoke_body(reduce_params, body, ctx); + } else { + LaunchContextT ctx; + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } // Using a flatten global policy as we may use all dimensions RAJA::expt::ParamMultiplexer::parampack_combine( RAJA::cuda_flatten_global_xyz_direct {}, reduce_params); } -template +template struct LaunchExecute< - RAJA::policy::cuda::cuda_launch_explicit_t> + RAJA::policy::cuda::cuda_launch_explicit_t> { template @@ -178,11 +191,11 @@ struct LaunchExecute< // Use a generic block size policy here to match that used in // parampack_combine using EXEC_POL = RAJA::policy::cuda::cuda_launch_explicit_t< - async, named_usage::unspecified, named_usage::unspecified>; + async, named_usage::unspecified, named_usage::unspecified>; EXEC_POL pol {}; auto func = reinterpret_cast( - &launch_new_reduce_global_fcn_fixed>); resources::Cuda cuda_res = res.get(); @@ -264,11 +277,11 @@ using cuda_ctx_thread_loop_z = cuda_ctx_thread_loop; } // namespace expt template -struct LoopExecute, SEGMENT> +struct LoopExecute, SEGMENT> { template - static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx, + static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT const& ctx, SEGMENT const& segment, BODY const& body) { diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index d521dede10..0dca597f1d 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -26,6 +26,7 @@ #include #include "RAJA/pattern/reduce.hpp" +#include "RAJA/pattern/launch/launch_context_policy.hpp" #include "RAJA/policy/PolicyBase.hpp" #include "RAJA/policy/sequential/policy.hpp" @@ -362,7 +363,8 @@ struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< template + size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM, + typename LaunchContextPolicy = LaunchContextDefaultPolicy> struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, @@ -1726,12 +1728,14 @@ using policy::cuda::cuda_synchronize; // policies usable with launch template + size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM, + typename LaunchContextPolicy = LaunchContextDefaultPolicy> using cuda_launch_explicit_t = policy::cuda::cuda_launch_explicit_t; // CUDA will emit warnings if we specify BLOCKS_PER_SM but not num of threads -template +template using cuda_launch_t = policy::cuda::cuda_launch_explicit_t Date: Thu, 18 Dec 2025 06:44:29 -0800 Subject: [PATCH 11/29] make style --- include/RAJA/pattern/launch/launch_core.hpp | 22 +++------- include/RAJA/policy/cuda/launch.hpp | 47 ++++++++++++++------- include/RAJA/policy/cuda/policy.hpp | 17 ++++---- include/RAJA/policy/hip/launch.hpp | 31 +++++++++----- include/RAJA/policy/hip/policy.hpp | 4 +- 5 files changed, 70 insertions(+), 51 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index a936d8d564..ce58a6f88f 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -236,41 +236,33 @@ template<> class LaunchContextT : public LaunchContextBase { public: + static constexpr bool hasDim3 = false; -static constexpr bool hasDim3 = false; - -using LaunchContextBase::LaunchContextBase; - + using LaunchContextBase::LaunchContextBase; }; // Preserve backwards compatibility using LaunchContext = LaunchContextT; #if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) -template <> +template<> class LaunchContextT : public LaunchContextBase { public: - static constexpr bool hasDim3 = true; dim3 thread_id; dim3 block_dim; RAJA_HOST_DEVICE - LaunchContextT() - : LaunchContextBase() - , thread_id() - , block_dim() - {} + LaunchContextT() : LaunchContextBase(), thread_id(), block_dim() {} RAJA_HOST_DEVICE LaunchContextT(dim3 thread, dim3 block) - : LaunchContextBase() - , thread_id(thread) - , block_dim(block) + : LaunchContextBase(), + thread_id(thread), + block_dim(block) {} - }; #endif diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index a965c166be..cfa1607a3c 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -40,11 +40,14 @@ __global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - if constexpr(LaunchContextT::hasDim3) { + if constexpr (LaunchContextT::hasDim3) + { LaunchContextT ctx(threadIdx, blockDim); ctx.shared_mem_ptr = raja_shmem_ptr; RAJA::expt::invoke_body(reduce_params, body, ctx); - } else { + } + else + { LaunchContextT ctx; ctx.shared_mem_ptr = raja_shmem_ptr; RAJA::expt::invoke_body(reduce_params, body, ctx); @@ -78,7 +81,8 @@ struct LaunchExecute< EXEC_POL pol {}; auto func = reinterpret_cast( - &launch_new_reduce_global_fcn>); + &launch_new_reduce_global_fcn>); resources::Cuda cuda_res = res.get(); @@ -158,14 +162,17 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - if constexpr(LaunchContextT::hasDim3) { + if constexpr (LaunchContextT::hasDim3) + { LaunchContextT ctx(threadIdx, blockDim); - ctx.shared_mem_ptr = raja_shmem_ptr; - RAJA::expt::invoke_body(reduce_params, body, ctx); - } else { + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } + else + { LaunchContextT ctx; - ctx.shared_mem_ptr = raja_shmem_ptr; - RAJA::expt::invoke_body(reduce_params, body, ctx); + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); } // Using a flatten global policy as we may use all dimensions @@ -173,9 +180,15 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ RAJA::cuda_flatten_global_xyz_direct {}, reduce_params); } -template +template struct LaunchExecute< - RAJA::policy::cuda::cuda_launch_explicit_t> + RAJA::policy::cuda::cuda_launch_explicit_t> { template @@ -191,11 +204,12 @@ struct LaunchExecute< // Use a generic block size policy here to match that used in // parampack_combine using EXEC_POL = RAJA::policy::cuda::cuda_launch_explicit_t< - async, named_usage::unspecified, named_usage::unspecified>; + async, named_usage::unspecified, named_usage::unspecified>; EXEC_POL pol {}; auto func = reinterpret_cast( - &launch_new_reduce_global_fcn_fixed>); resources::Cuda cuda_res = res.get(); @@ -281,9 +295,10 @@ struct LoopExecute, SEGMENT> { template - static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT const& ctx, - SEGMENT const& segment, - BODY const& body) + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContextT const& ctx, + SEGMENT const& segment, + BODY const& body) { const int len = segment.end() - segment.begin(); diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 0dca597f1d..1526154493 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -362,9 +362,9 @@ struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< }; template + int num_threads = named_usage::unspecified, + size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM, + typename LaunchContextPolicy = LaunchContextDefaultPolicy> struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, @@ -1727,15 +1727,16 @@ using policy::cuda::cuda_synchronize; // policies usable with launch template + int num_threads = named_usage::unspecified, + size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM, + typename LaunchContextPolicy = LaunchContextDefaultPolicy> using cuda_launch_explicit_t = policy::cuda::cuda_launch_explicit_t; // CUDA will emit warnings if we specify BLOCKS_PER_SM but not num of threads -template +template using cuda_launch_t = policy::cuda::cuda_launch_explicit_t::hasDim3) { + if constexpr (LaunchContextT::hasDim3) + { LaunchContextT ctx(threadIdx, blockDim); ctx.shared_mem_ptr = raja_shmem_ptr; RAJA::expt::invoke_body(reduce_params, body, ctx); - } else { + } + else + { LaunchContextT ctx; ctx.shared_mem_ptr = raja_shmem_ptr; RAJA::expt::invoke_body(reduce_params, body, ctx); @@ -56,8 +59,9 @@ __global__ void launch_new_reduce_global_fcn(const BODY body_in, } template -struct LaunchExecute< - RAJA::policy::hip::hip_launch_t> +struct LaunchExecute> { template @@ -139,7 +143,10 @@ struct LaunchExecute< } }; -template +template __launch_bounds__(num_threads, 1) __global__ void launch_new_reduce_global_fcn_fixed(const BODY body_in, ReduceParams reduce_params) @@ -152,11 +159,14 @@ __launch_bounds__(num_threads, 1) __global__ // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - if constexpr(LaunchContextT::hasDim3) { + if constexpr (LaunchContextT::hasDim3) + { LaunchContextT ctx(threadIdx, blockDim); ctx.shared_mem_ptr = raja_shmem_ptr; RAJA::expt::invoke_body(reduce_params, body, ctx); - } else { + } + else + { LaunchContextT ctx; ctx.shared_mem_ptr = raja_shmem_ptr; RAJA::expt::invoke_body(reduce_params, body, ctx); @@ -280,9 +290,10 @@ struct LoopExecute, SEGMENT> { template - static RAJA_INLINE RAJA_DEVICE void exec(LaunchContextT const& ctx, - SEGMENT const& segment, - BODY const& body) + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContextT const& ctx, + SEGMENT const& segment, + BODY const& body) { const int len = segment.end() - segment.begin(); diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 0a94af3d8d..f6f09b379e 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -360,8 +360,8 @@ struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< }; template + int num_threads = named_usage::unspecified, + typename LaunchContextPolicy = LaunchContextDefaultPolicy> struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::region, From d21c41f8158fa633d507574af4c9b9f5444427ff Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 18 Dec 2025 07:50:05 -0800 Subject: [PATCH 12/29] omp backend --- include/RAJA/policy/openmp/launch.hpp | 88 +++++++++++------------ include/RAJA/policy/openmp/policy.hpp | 18 +++-- include/RAJA/policy/sequential/launch.hpp | 32 ++++----- 3 files changed, 73 insertions(+), 65 deletions(-) diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp index 2ba8066fe9..a237911511 100644 --- a/include/RAJA/policy/openmp/launch.hpp +++ b/include/RAJA/policy/openmp/launch.hpp @@ -24,11 +24,10 @@ namespace RAJA { -template<> -struct LaunchExecute +template +struct LaunchExecute> { - template static concepts::enable_if_t< resources::EventProxy, @@ -46,12 +45,13 @@ struct LaunchExecute using BodyType = decltype(thread_privatize(body)); auto parallel_section = [&](ReduceParams& f_params, auto func) { - LaunchContext ctx; + LaunchContextT ctx; auto loop_body = thread_privatize(body); - static_assert(std::is_invocable::value, - "Internal RAJA error: Check the parallel kernel passed to " - "OpenMP Parallel section in openmp/launch.hpp"); + static_assert( + std::is_invocable&>::value, + "Internal RAJA error: Check the parallel kernel passed to " + "OpenMP Parallel section in openmp/launch.hpp"); ctx.shared_mem_ptr = (char*)malloc(launch_params.shared_mem_size); @@ -74,7 +74,7 @@ struct LaunchExecute // pragma so that the reduction parameter pack it operates on is the // version tracked by the combine OpenMP syntax auto parallel_kernel = [&](ReduceParams& f_params, BodyType& body, - LaunchContext& ctx) { + LaunchContextT& ctx) { expt::invoke_body(f_params, body.get_priv(), ctx); }; parallel_section(f_params, parallel_kernel); @@ -84,7 +84,7 @@ struct LaunchExecute { RAJA::region([&]() { auto parallel_kernel = [&](ReduceParams&, BodyType& body, - LaunchContext& ctx) { + LaunchContextT& ctx) { body.get_priv()(ctx); }; parallel_section(f_params, parallel_kernel); @@ -101,9 +101,9 @@ template struct LoopExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -121,9 +121,9 @@ struct LoopExecute }); } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -149,9 +149,9 @@ struct LoopExecute }); } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -187,9 +187,9 @@ template struct LoopExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -203,9 +203,9 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -225,9 +225,9 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -260,9 +260,9 @@ template struct LoopICountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -276,9 +276,9 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -298,9 +298,9 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -333,9 +333,9 @@ template struct LoopExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -361,9 +361,9 @@ struct LoopExecute }); } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -400,9 +400,9 @@ template struct LoopICountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -428,9 +428,9 @@ struct LoopICountExecute }); } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -466,9 +466,9 @@ template struct TileExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -493,9 +493,9 @@ template struct TileTCountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -522,9 +522,9 @@ template struct TileExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -543,9 +543,9 @@ template struct TileTCountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) diff --git a/include/RAJA/policy/openmp/policy.hpp b/include/RAJA/policy/openmp/policy.hpp index 0e07af334a..d2b689dc1a 100644 --- a/include/RAJA/policy/openmp/policy.hpp +++ b/include/RAJA/policy/openmp/policy.hpp @@ -20,6 +20,8 @@ #include +#include "RAJA/pattern/launch/launch_context_policy.hpp" + #include "RAJA/policy/PolicyBase.hpp" // Rely on builtin_atomic when OpenMP can't do the job @@ -138,10 +140,12 @@ struct omp_parallel_region /// /// Struct supporting OpenMP parallel region for Teams /// -struct omp_launch_t : make_policy_pattern_launch_platform_t +template +struct omp_launch_typed + : make_policy_pattern_launch_platform_t {}; /// @@ -456,7 +460,11 @@ using policy::omp::omp_for_runtime_exec; /// /// Type aliases for omp parallel region /// -using policy::omp::omp_launch_t; +template +using omp_launch_typed = policy::omp::omp_launch_typed; +using omp_launch_t = omp_launch_typed; + +// using policy::omp::omp_launch_t; using policy::omp::omp_parallel_region; /// diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp index ee98804ecf..87bb92f27a 100644 --- a/include/RAJA/policy/sequential/launch.hpp +++ b/include/RAJA/policy/sequential/launch.hpp @@ -113,9 +113,9 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -127,9 +127,9 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -149,9 +149,9 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -181,9 +181,9 @@ template struct LoopICountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -194,9 +194,9 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -216,9 +216,9 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -250,9 +250,9 @@ template struct TileExecute { - template + template static RAJA_HOST_DEVICE RAJA_INLINE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -271,9 +271,9 @@ template struct TileTCountExecute { - template + template static RAJA_HOST_DEVICE RAJA_INLINE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) From 40a5c1be7f7eea444df553bbb7f04521b8f5323b Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 18 Dec 2025 08:05:08 -0800 Subject: [PATCH 13/29] seq backend + make style --- include/RAJA/policy/sequential/launch.hpp | 8 ++++---- include/RAJA/policy/sequential/policy.hpp | 18 +++++++++++++----- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp index 87bb92f27a..be1ff98237 100644 --- a/include/RAJA/policy/sequential/launch.hpp +++ b/include/RAJA/policy/sequential/launch.hpp @@ -36,8 +36,8 @@ struct LaunchExecute } }; -template<> -struct LaunchExecute +template +struct LaunchExecute> { template @@ -51,7 +51,7 @@ struct LaunchExecute ReduceParams& RAJA_UNUSED_ARG(ReduceParams)) { - LaunchContext ctx; + LaunchContextT ctx; char* kernel_local_mem = new char[params.shared_mem_size]; ctx.shared_mem_ptr = kernel_local_mem; @@ -80,7 +80,7 @@ struct LaunchExecute expt::ParamMultiplexer::parampack_init(pol, launch_reducers); - LaunchContext ctx; + LaunchContextT ctx; char* kernel_local_mem = new char[launch_params.shared_mem_size]; ctx.shared_mem_ptr = kernel_local_mem; diff --git a/include/RAJA/policy/sequential/policy.hpp b/include/RAJA/policy/sequential/policy.hpp index 9ead6375ae..6a2d88d883 100644 --- a/include/RAJA/policy/sequential/policy.hpp +++ b/include/RAJA/policy/sequential/policy.hpp @@ -18,6 +18,8 @@ #ifndef policy_sequential_HPP #define policy_sequential_HPP +#include "RAJA/pattern/launch/launch_context_policy.hpp" + #include "RAJA/policy/PolicyBase.hpp" namespace RAJA @@ -63,10 +65,12 @@ struct seq_region : make_policy_pattern_launch_platform_t {}; -struct seq_launch_t : make_policy_pattern_launch_platform_t +template +struct seq_launch_typed + : make_policy_pattern_launch_platform_t {}; struct seq_exec : make_policy_pattern_launch_platform_t +using seq_launch_typed = + policy::sequential::seq_launch_typed; +using seq_launch_t = seq_launch_typed; using policy::sequential::seq_multi_reduce; using policy::sequential::seq_reduce; using policy::sequential::seq_region; From e0f48250355f03e7efeb67a43326e300620ea64b Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 18 Dec 2025 08:07:02 -0800 Subject: [PATCH 14/29] clean up pass --- include/RAJA/policy/openmp/launch.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp index a237911511..433ded4381 100644 --- a/include/RAJA/policy/openmp/launch.hpp +++ b/include/RAJA/policy/openmp/launch.hpp @@ -25,7 +25,7 @@ namespace RAJA { template -struct LaunchExecute> +struct LaunchExecute> { template From 96e99d5c6e39da7670d3f7330761c87aa710fb8f Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 18 Dec 2025 08:10:07 -0800 Subject: [PATCH 15/29] Update include/RAJA/pattern/launch/launch_context_policy.hpp --- include/RAJA/pattern/launch/launch_context_policy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/pattern/launch/launch_context_policy.hpp b/include/RAJA/pattern/launch/launch_context_policy.hpp index c50a5484a8..bf05698700 100644 --- a/include/RAJA/pattern/launch/launch_context_policy.hpp +++ b/include/RAJA/pattern/launch/launch_context_policy.hpp @@ -3,7 +3,7 @@ * * \file * - * \brief RAJA header file containing the core components of RAJA::launch + * \brief RAJA header file containing template types of RAJA::LaunchContextT * ****************************************************************************** */ From a9f0cca853ad3d442df44b113b4167362143a245 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 18 Dec 2025 08:53:55 -0800 Subject: [PATCH 16/29] minor clean up --- examples/raja-launch.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/raja-launch.cpp b/examples/raja-launch.cpp index 0c60ae9b16..cf345822a6 100644 --- a/examples/raja-launch.cpp +++ b/examples/raja-launch.cpp @@ -36,7 +36,8 @@ */ using launch_policy = RAJA::LaunchPolicy< #if defined(RAJA_ENABLE_OPENMP) - RAJA::omp_launch_t +RAJA::omp_launch_typed +//RAJA::seq_launch_typed #else RAJA::seq_launch_t #endif @@ -46,7 +47,7 @@ using launch_policy = RAJA::LaunchPolicy< #endif #if defined(RAJA_ENABLE_HIP) , - RAJA::hip_launch_t + RAJA::hip_launch_t #endif >; @@ -153,7 +154,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::launch (select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + [=] RAJA_HOST_DEVICE(RAJA::LaunchContextT ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { From 7d4595b3d75e72b389cfe5353a54cc1b29935fca Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 18 Dec 2025 08:54:10 -0800 Subject: [PATCH 17/29] minor clean up --- include/RAJA/pattern/launch/launch_core.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index ce58a6f88f..8fa7639227 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -254,10 +254,9 @@ class LaunchContextT : public LaunchContextBase dim3 thread_id; dim3 block_dim; - RAJA_HOST_DEVICE LaunchContextT() : LaunchContextBase(), thread_id(), block_dim() {} - RAJA_HOST_DEVICE + RAJA_DEVICE LaunchContextT(dim3 thread, dim3 block) : LaunchContextBase(), thread_id(thread), From c990a4f033c310a2c9fff9df66f1d0b9f9af3447 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 18 Dec 2025 09:27:35 -0800 Subject: [PATCH 18/29] revert changes to example --- examples/raja-launch.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/raja-launch.cpp b/examples/raja-launch.cpp index cf345822a6..0c60ae9b16 100644 --- a/examples/raja-launch.cpp +++ b/examples/raja-launch.cpp @@ -36,8 +36,7 @@ */ using launch_policy = RAJA::LaunchPolicy< #if defined(RAJA_ENABLE_OPENMP) -RAJA::omp_launch_typed -//RAJA::seq_launch_typed + RAJA::omp_launch_t #else RAJA::seq_launch_t #endif @@ -47,7 +46,7 @@ RAJA::omp_launch_typed #endif #if defined(RAJA_ENABLE_HIP) , - RAJA::hip_launch_t + RAJA::hip_launch_t #endif >; @@ -154,7 +153,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::launch (select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContextT ctx) { + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { From f7939fd04aaa79aef359c8ab59b3214532dfb41a Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 18 Dec 2025 12:18:23 -0800 Subject: [PATCH 19/29] remove specialization from launch policy --- .../pattern/launch/launch_context_policy.hpp | 37 +++++++++++++++++++ include/RAJA/policy/hip/launch.hpp | 35 ++++++++++-------- include/RAJA/policy/hip/policy.hpp | 5 +-- 3 files changed, 58 insertions(+), 19 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_context_policy.hpp b/include/RAJA/pattern/launch/launch_context_policy.hpp index bf05698700..e74f7dbc74 100644 --- a/include/RAJA/pattern/launch/launch_context_policy.hpp +++ b/include/RAJA/pattern/launch/launch_context_policy.hpp @@ -27,5 +27,42 @@ class LaunchContextDefaultPolicy; class LaunchContextDim3Policy; #endif +namespace detail +{ + +// Primary template +template +struct function_traits; + +// Specialization for plain function pointers +template +struct function_traits { + using result_type = R; + static constexpr std::size_t arity = sizeof...(Args); + + template + struct arg { + static_assert(N < arity, "argument index out of range"); + using type = typename std::tuple_element>::type; + }; +}; + +// Specialization for const member function pointers, +// which is what a non-mutable lambda's operator() usually is. +template +struct function_traits + : function_traits {}; + +// Optional: handle mutable lambdas (non-const operator()) +template +struct function_traits + : function_traits {}; + +// Convenience alias for lambdas and other callable objects +template +using lambda_traits = function_traits; + +} + } // namespace RAJA #endif diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index dfb1600389..0e2e7dcdc9 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -28,7 +28,7 @@ namespace RAJA { -template +template __global__ void launch_new_reduce_global_fcn(const BODY body_in, ReduceParams reduce_params) { @@ -40,15 +40,18 @@ __global__ void launch_new_reduce_global_fcn(const BODY body_in, // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - if constexpr (LaunchContextT::hasDim3) + using traits = detail::lambda_traits; + using LaunchContextType = typename traits::template arg<0>::type; + + if constexpr (LaunchContextType::hasDim3) { - LaunchContextT ctx(threadIdx, blockDim); + LaunchContextType ctx(threadIdx, blockDim); ctx.shared_mem_ptr = raja_shmem_ptr; RAJA::expt::invoke_body(reduce_params, body, ctx); } else { - LaunchContextT ctx; + LaunchContextType ctx; ctx.shared_mem_ptr = raja_shmem_ptr; RAJA::expt::invoke_body(reduce_params, body, ctx); } @@ -58,10 +61,10 @@ __global__ void launch_new_reduce_global_fcn(const BODY body_in, RAJA::hip_flatten_global_xyz_direct {}, reduce_params); } -template +template struct LaunchExecute> + named_usage::unspecified + >> { template @@ -79,7 +82,7 @@ struct LaunchExecute( - &launch_new_reduce_global_fcn>); resources::Hip hip_res = res.get(); @@ -145,7 +148,6 @@ struct LaunchExecute __launch_bounds__(num_threads, 1) __global__ void launch_new_reduce_global_fcn_fixed(const BODY body_in, @@ -159,15 +161,18 @@ __launch_bounds__(num_threads, 1) __global__ // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - if constexpr (LaunchContextT::hasDim3) + using traits = detail::lambda_traits; + using LaunchContextType = typename traits::template arg<0>::type; + + if constexpr (LaunchContextType::hasDim3) { - LaunchContextT ctx(threadIdx, blockDim); + LaunchContextType ctx(threadIdx, blockDim); ctx.shared_mem_ptr = raja_shmem_ptr; RAJA::expt::invoke_body(reduce_params, body, ctx); } else { - LaunchContextT ctx; + LaunchContextType ctx; ctx.shared_mem_ptr = raja_shmem_ptr; RAJA::expt::invoke_body(reduce_params, body, ctx); } @@ -177,9 +182,9 @@ __launch_bounds__(num_threads, 1) __global__ RAJA::hip_flatten_global_xyz_direct {}, reduce_params); } -template +template struct LaunchExecute< - RAJA::policy::hip::hip_launch_t> + RAJA::policy::hip::hip_launch_t> { template @@ -199,7 +204,7 @@ struct LaunchExecute< EXEC_POL pol {}; auto func = reinterpret_cast( - &launch_new_reduce_global_fcn_fixed>); resources::Hip hip_res = res.get(); diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index f6f09b379e..e7c3ca0672 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -26,7 +26,6 @@ #include "hip/hip_runtime.h" #include "RAJA/pattern/reduce.hpp" -#include "RAJA/pattern/launch/launch_context_policy.hpp" #include "RAJA/policy/PolicyBase.hpp" #include "RAJA/policy/sequential/policy.hpp" @@ -359,9 +358,7 @@ struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< using LaunchConcretizer = _LaunchConcretizer; }; -template +template struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::region, From c24331caac5baadcc5769a17e73d359ec54be01d Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 18 Dec 2025 14:26:59 -0800 Subject: [PATCH 20/29] make work for function pointers --- .../pattern/launch/launch_context_policy.hpp | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_context_policy.hpp b/include/RAJA/pattern/launch/launch_context_policy.hpp index e74f7dbc74..46c63ce570 100644 --- a/include/RAJA/pattern/launch/launch_context_policy.hpp +++ b/include/RAJA/pattern/launch/launch_context_policy.hpp @@ -59,8 +59,28 @@ struct function_traits : function_traits {}; // Convenience alias for lambdas and other callable objects -template -using lambda_traits = function_traits; + +//add a conditional if a function pointer is provided +//is pointer a type just pass it through otherwise do give me the operator +//static error if not a function type + +//template +//using lambda_traits = function_traits; + +// Helper to strip cv/ref from a type +template +using decay_t = typename std::decay::type; + +// Convenience alias for callable entities: +// - If T is a function pointer, use function_traits directly +// - Otherwise, assume it is a callable object and use &T::operator() +template +using lambda_traits = + std::conditional_t< + std::is_pointer>::value, + function_traits>, + function_traits::operator())> + >; } From 0518138bb49a9f469427289102f893d31db438f2 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 19 Dec 2025 01:02:45 -0800 Subject: [PATCH 21/29] store dim3 based on launch context type - hip --- .../pattern/launch/launch_context_policy.hpp | 96 ++++++++++++------- include/RAJA/pattern/launch/launch_core.hpp | 3 - include/RAJA/policy/hip/launch.hpp | 23 ++--- 3 files changed, 69 insertions(+), 53 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_context_policy.hpp b/include/RAJA/pattern/launch/launch_context_policy.hpp index 46c63ce570..49e3d03625 100644 --- a/include/RAJA/pattern/launch/launch_context_policy.hpp +++ b/include/RAJA/pattern/launch/launch_context_policy.hpp @@ -21,6 +21,9 @@ namespace RAJA { +template +class LaunchContextT; + class LaunchContextDefaultPolicy; #if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) @@ -30,59 +33,80 @@ class LaunchContextDim3Policy; namespace detail { -// Primary template + +template < typename T, typename = void > +struct has_single_call_operator : std::false_type {}; + +template < typename T > +struct has_single_call_operator::operator()), void>>> : std::true_type {}; + + template -struct function_traits; +struct function_traits{}; -// Specialization for plain function pointers template -struct function_traits { - using result_type = R; - static constexpr std::size_t arity = sizeof...(Args); - - template - struct arg { - static_assert(N < arity, "argument index out of range"); - using type = typename std::tuple_element>::type; - }; +struct function_traits { + using result_type = R; + static constexpr std::size_t arity = sizeof...(Args); + + template + struct arg { + static_assert(N < arity, "argument index out of range"); + using type = typename std::tuple_element>::type; + }; }; -// Specialization for const member function pointers, -// which is what a non-mutable lambda's operator() usually is. +template +struct function_traits : function_traits {}; + +template +struct function_traits : function_traits {}; + template struct function_traits - : function_traits {}; + : function_traits { + using functional_type = C; +}; -// Optional: handle mutable lambdas (non-const operator()) template struct function_traits - : function_traits {}; + : function_traits { + using functional_type = C; +}; + -// Convenience alias for lambdas and other callable objects +template >::value> +struct functional_traits : function_traits> {}; -//add a conditional if a function pointer is provided -//is pointer a type just pass it through otherwise do give me the operator -//static error if not a function type +template +struct functional_traits : function_traits::operator())> {}; -//template -//using lambda_traits = function_traits; -// Helper to strip cv/ref from a type +template +struct has_arg0 : std::false_type {}; + template -using decay_t = typename std::decay::type; +struct has_arg0< + T, + typename std::enable_if_t< + !std::is_same_v::template arg<0>::type, void> + > +> : std::true_type {}; + + +template ::value> +struct launch_context_type { + using type = LaunchContextT; +}; -// Convenience alias for callable entities: -// - If T is a function pointer, use function_traits directly -// - Otherwise, assume it is a callable object and use &T::operator() template -using lambda_traits = - std::conditional_t< - std::is_pointer>::value, - function_traits>, - function_traits::operator())> - >; - -} +struct launch_context_type { + using type = typename functional_traits::template arg<0>::type; +}; + + +} // namespace detail } // namespace RAJA #endif diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 8fa7639227..2739cca8d4 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -229,9 +229,6 @@ class LaunchContextBase } }; -template -class LaunchContextT; - template<> class LaunchContextT : public LaunchContextBase { diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 0e2e7dcdc9..e0f2c431ad 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -40,8 +40,8 @@ __global__ void launch_new_reduce_global_fcn(const BODY body_in, // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - using traits = detail::lambda_traits; - using LaunchContextType = typename traits::template arg<0>::type; + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; if constexpr (LaunchContextType::hasDim3) { @@ -62,9 +62,8 @@ __global__ void launch_new_reduce_global_fcn(const BODY body_in, } template -struct LaunchExecute> +struct LaunchExecute< + RAJA::policy::hip::hip_launch_t> { template @@ -82,8 +81,7 @@ struct LaunchExecute( - &launch_new_reduce_global_fcn>); + &launch_new_reduce_global_fcn>); resources::Hip hip_res = res.get(); @@ -146,9 +144,7 @@ struct LaunchExecute +template __launch_bounds__(num_threads, 1) __global__ void launch_new_reduce_global_fcn_fixed(const BODY body_in, ReduceParams reduce_params) @@ -161,8 +157,8 @@ __launch_bounds__(num_threads, 1) __global__ // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - using traits = detail::lambda_traits; - using LaunchContextType = typename traits::template arg<0>::type; + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; if constexpr (LaunchContextType::hasDim3) { @@ -183,8 +179,7 @@ __launch_bounds__(num_threads, 1) __global__ } template -struct LaunchExecute< - RAJA::policy::hip::hip_launch_t> +struct LaunchExecute> { template From d5da29a593d7f7e6da6ca0dd5cc523891f66e279 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 19 Dec 2025 01:10:14 -0800 Subject: [PATCH 22/29] rework omp backend --- include/RAJA/policy/openmp/launch.hpp | 23 +++++++++++++---------- include/RAJA/policy/openmp/policy.hpp | 18 +++++------------- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp index 433ded4381..6b3255c094 100644 --- a/include/RAJA/policy/openmp/launch.hpp +++ b/include/RAJA/policy/openmp/launch.hpp @@ -24,8 +24,8 @@ namespace RAJA { -template -struct LaunchExecute> +template<> +struct LaunchExecute { template @@ -44,14 +44,17 @@ struct LaunchExecute> EXEC_POL pol {}; using BodyType = decltype(thread_privatize(body)); + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + auto parallel_section = [&](ReduceParams& f_params, auto func) { - LaunchContextT ctx; + LaunchContextType ctx; + auto loop_body = thread_privatize(body); - static_assert( - std::is_invocable&>::value, - "Internal RAJA error: Check the parallel kernel passed to " - "OpenMP Parallel section in openmp/launch.hpp"); + static_assert(std::is_invocable::value, + "Internal RAJA error: Check the parallel kernel passed to " + "OpenMP Parallel section in openmp/launch.hpp"); ctx.shared_mem_ptr = (char*)malloc(launch_params.shared_mem_size); @@ -74,7 +77,7 @@ struct LaunchExecute> // pragma so that the reduction parameter pack it operates on is the // version tracked by the combine OpenMP syntax auto parallel_kernel = [&](ReduceParams& f_params, BodyType& body, - LaunchContextT& ctx) { + LaunchContextType& ctx) { expt::invoke_body(f_params, body.get_priv(), ctx); }; parallel_section(f_params, parallel_kernel); @@ -84,7 +87,7 @@ struct LaunchExecute> { RAJA::region([&]() { auto parallel_kernel = [&](ReduceParams&, BodyType& body, - LaunchContextT& ctx) { + LaunchContextType& ctx) { body.get_priv()(ctx); }; parallel_section(f_params, parallel_kernel); diff --git a/include/RAJA/policy/openmp/policy.hpp b/include/RAJA/policy/openmp/policy.hpp index d2b689dc1a..0e07af334a 100644 --- a/include/RAJA/policy/openmp/policy.hpp +++ b/include/RAJA/policy/openmp/policy.hpp @@ -20,8 +20,6 @@ #include -#include "RAJA/pattern/launch/launch_context_policy.hpp" - #include "RAJA/policy/PolicyBase.hpp" // Rely on builtin_atomic when OpenMP can't do the job @@ -140,12 +138,10 @@ struct omp_parallel_region /// /// Struct supporting OpenMP parallel region for Teams /// -template -struct omp_launch_typed - : make_policy_pattern_launch_platform_t +struct omp_launch_t : make_policy_pattern_launch_platform_t {}; /// @@ -460,11 +456,7 @@ using policy::omp::omp_for_runtime_exec; /// /// Type aliases for omp parallel region /// -template -using omp_launch_typed = policy::omp::omp_launch_typed; -using omp_launch_t = omp_launch_typed; - -// using policy::omp::omp_launch_t; +using policy::omp::omp_launch_t; using policy::omp::omp_parallel_region; /// From af88dbb6738412407da7833f1800a8ac5d3fbefe Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 19 Dec 2025 01:23:05 -0800 Subject: [PATCH 23/29] update sequential backend --- include/RAJA/policy/sequential/launch.hpp | 14 ++++++++++---- include/RAJA/policy/sequential/policy.hpp | 18 +++++------------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp index be1ff98237..742a229412 100644 --- a/include/RAJA/policy/sequential/launch.hpp +++ b/include/RAJA/policy/sequential/launch.hpp @@ -36,8 +36,8 @@ struct LaunchExecute } }; -template -struct LaunchExecute> +template<> +struct LaunchExecute { template @@ -51,7 +51,10 @@ struct LaunchExecute> ReduceParams& RAJA_UNUSED_ARG(ReduceParams)) { - LaunchContextT ctx; + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + + LaunchContextType ctx; char* kernel_local_mem = new char[params.shared_mem_size]; ctx.shared_mem_ptr = kernel_local_mem; @@ -80,7 +83,10 @@ struct LaunchExecute> expt::ParamMultiplexer::parampack_init(pol, launch_reducers); - LaunchContextT ctx; + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + + LaunchContextType ctx; char* kernel_local_mem = new char[launch_params.shared_mem_size]; ctx.shared_mem_ptr = kernel_local_mem; diff --git a/include/RAJA/policy/sequential/policy.hpp b/include/RAJA/policy/sequential/policy.hpp index 6a2d88d883..9ead6375ae 100644 --- a/include/RAJA/policy/sequential/policy.hpp +++ b/include/RAJA/policy/sequential/policy.hpp @@ -18,8 +18,6 @@ #ifndef policy_sequential_HPP #define policy_sequential_HPP -#include "RAJA/pattern/launch/launch_context_policy.hpp" - #include "RAJA/policy/PolicyBase.hpp" namespace RAJA @@ -65,12 +63,10 @@ struct seq_region : make_policy_pattern_launch_platform_t {}; -template -struct seq_launch_typed - : make_policy_pattern_launch_platform_t +struct seq_launch_t : make_policy_pattern_launch_platform_t {}; struct seq_exec : make_policy_pattern_launch_platform_t -using seq_launch_typed = - policy::sequential::seq_launch_typed; -using seq_launch_t = seq_launch_typed; +using policy::sequential::seq_launch_t; using policy::sequential::seq_multi_reduce; using policy::sequential::seq_reduce; using policy::sequential::seq_region; From 21ad0a8a00e48f7409f3fc49333eaf1c3ad9e461 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 19 Dec 2025 01:42:02 -0800 Subject: [PATCH 24/29] get things building for cuda -- need a good clean up pass --- include/RAJA/policy/cuda/launch.hpp | 48 +++++++++++++---------------- include/RAJA/policy/cuda/policy.hpp | 15 +++------ 2 files changed, 27 insertions(+), 36 deletions(-) diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index cfa1607a3c..eb4d4d8f2d 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -28,7 +28,7 @@ namespace RAJA { -template +template __global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY body_in, ReduceParams reduce_params) @@ -40,15 +40,18 @@ __global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - if constexpr (LaunchContextT::hasDim3) + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + + if constexpr (LaunchContextType::hasDim3) { - LaunchContextT ctx(threadIdx, blockDim); + LaunchContextType ctx(threadIdx, blockDim); ctx.shared_mem_ptr = raja_shmem_ptr; RAJA::expt::invoke_body(reduce_params, body, ctx); } else { - LaunchContextT ctx; + LaunchContextType ctx; ctx.shared_mem_ptr = raja_shmem_ptr; RAJA::expt::invoke_body(reduce_params, body, ctx); } @@ -58,12 +61,11 @@ __global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY RAJA::cuda_flatten_global_xyz_direct {}, reduce_params); } -template +template struct LaunchExecute< RAJA::policy::cuda::cuda_launch_explicit_t> + named_usage::unspecified>> { template @@ -81,8 +83,7 @@ struct LaunchExecute< EXEC_POL pol {}; auto func = reinterpret_cast( - &launch_new_reduce_global_fcn>); + &launch_new_reduce_global_fcn>); resources::Cuda cuda_res = res.get(); @@ -147,7 +148,6 @@ struct LaunchExecute< template __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY @@ -162,15 +162,18 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - if constexpr (LaunchContextT::hasDim3) + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + + if constexpr (LaunchContextType::hasDim3) { - LaunchContextT ctx(threadIdx, blockDim); + LaunchContextType ctx(threadIdx, blockDim); ctx.shared_mem_ptr = raja_shmem_ptr; RAJA::expt::invoke_body(reduce_params, body, ctx); } else { - LaunchContextT ctx; + LaunchContextType ctx; ctx.shared_mem_ptr = raja_shmem_ptr; RAJA::expt::invoke_body(reduce_params, body, ctx); } @@ -180,15 +183,9 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ RAJA::cuda_flatten_global_xyz_direct {}, reduce_params); } -template +template struct LaunchExecute< - RAJA::policy::cuda::cuda_launch_explicit_t> + RAJA::policy::cuda::cuda_launch_explicit_t> { template @@ -209,7 +206,6 @@ struct LaunchExecute< auto func = reinterpret_cast( &launch_new_reduce_global_fcn_fixed>); resources::Cuda cuda_res = res.get(); @@ -408,9 +404,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -508,9 +504,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 1526154493..d521dede10 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -26,7 +26,6 @@ #include #include "RAJA/pattern/reduce.hpp" -#include "RAJA/pattern/launch/launch_context_policy.hpp" #include "RAJA/policy/PolicyBase.hpp" #include "RAJA/policy/sequential/policy.hpp" @@ -362,9 +361,8 @@ struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< }; template + int num_threads = named_usage::unspecified, + size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM> struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, @@ -1727,16 +1725,13 @@ using policy::cuda::cuda_synchronize; // policies usable with launch template + int num_threads = named_usage::unspecified, + size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM> using cuda_launch_explicit_t = policy::cuda::cuda_launch_explicit_t; // CUDA will emit warnings if we specify BLOCKS_PER_SM but not num of threads -template +template using cuda_launch_t = policy::cuda::cuda_launch_explicit_t Date: Fri, 19 Dec 2025 01:59:57 -0800 Subject: [PATCH 25/29] cuda clean up pass --- include/RAJA/policy/cuda/launch.hpp | 148 ++++++++++++++-------------- 1 file changed, 74 insertions(+), 74 deletions(-) diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index eb4d4d8f2d..fa6e415d35 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -432,9 +432,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -468,9 +468,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -534,9 +534,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -579,9 +579,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -630,9 +630,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -654,9 +654,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -684,9 +684,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -712,9 +712,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -740,9 +740,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -776,9 +776,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -812,9 +812,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -842,9 +842,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -887,9 +887,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -951,9 +951,9 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -983,9 +983,9 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1025,9 +1025,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1062,9 +1062,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1113,9 +1113,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1150,9 +1150,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1188,9 +1188,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1214,9 +1214,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1249,9 +1249,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1284,9 +1284,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1314,9 +1314,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1355,9 +1355,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1398,9 +1398,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1431,9 +1431,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1481,9 +1481,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1539,9 +1539,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1565,9 +1565,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1601,9 +1601,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1637,9 +1637,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1667,9 +1667,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1709,9 +1709,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1753,9 +1753,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1786,9 +1786,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1840,9 +1840,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, From 597641bc0ba61c2ea565782c19d23c99fdaa20c4 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 19 Dec 2025 02:26:26 -0800 Subject: [PATCH 26/29] clean up ordering in hip launch --- include/RAJA/policy/hip/launch.hpp | 84 +++++++++++++++--------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index e0f2c431ad..e9c42d81ec 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -321,7 +321,7 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template // need to finish + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -345,7 +345,7 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -375,7 +375,7 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -403,7 +403,7 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -431,7 +431,7 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -467,7 +467,7 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -503,7 +503,7 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -533,7 +533,7 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -578,7 +578,7 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -629,7 +629,7 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -653,7 +653,7 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -683,7 +683,7 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -711,7 +711,7 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -739,7 +739,7 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -775,7 +775,7 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -811,7 +811,7 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -841,7 +841,7 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -886,7 +886,7 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -950,7 +950,7 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -982,7 +982,7 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -1024,7 +1024,7 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -1061,7 +1061,7 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -1112,7 +1112,7 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -1149,7 +1149,7 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -1187,7 +1187,7 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, @@ -1213,7 +1213,7 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, @@ -1248,7 +1248,7 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, @@ -1283,7 +1283,7 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, @@ -1313,7 +1313,7 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, @@ -1354,7 +1354,7 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, @@ -1397,7 +1397,7 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, @@ -1430,7 +1430,7 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, @@ -1480,7 +1480,7 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, @@ -1538,7 +1538,7 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, @@ -1564,7 +1564,7 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, @@ -1600,7 +1600,7 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, @@ -1636,7 +1636,7 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, @@ -1666,7 +1666,7 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, @@ -1708,7 +1708,7 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, @@ -1752,7 +1752,7 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, @@ -1785,7 +1785,7 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, @@ -1839,7 +1839,7 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, From 540373785f98a7a6efb4aa39c6457eea7cde8aad Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 19 Dec 2025 02:29:24 -0800 Subject: [PATCH 27/29] clean up ordering --- include/RAJA/policy/openmp/launch.hpp | 34 +++++++++++------------ include/RAJA/policy/sequential/launch.hpp | 12 ++++---- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp index 6b3255c094..51cfaff7f9 100644 --- a/include/RAJA/policy/openmp/launch.hpp +++ b/include/RAJA/policy/openmp/launch.hpp @@ -104,7 +104,7 @@ template struct LoopExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -124,7 +124,7 @@ struct LoopExecute }); } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -152,7 +152,7 @@ struct LoopExecute }); } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -190,7 +190,7 @@ template struct LoopExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -206,7 +206,7 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -228,7 +228,7 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -263,7 +263,7 @@ template struct LoopICountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -279,7 +279,7 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -301,7 +301,7 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -336,7 +336,7 @@ template struct LoopExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -364,7 +364,7 @@ struct LoopExecute }); } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -403,7 +403,7 @@ template struct LoopICountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -431,7 +431,7 @@ struct LoopICountExecute }); } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -469,7 +469,7 @@ template struct TileExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, @@ -496,7 +496,7 @@ template struct TileTCountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, @@ -525,7 +525,7 @@ template struct TileExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, @@ -546,7 +546,7 @@ template struct TileTCountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp index 742a229412..26eb287eb0 100644 --- a/include/RAJA/policy/sequential/launch.hpp +++ b/include/RAJA/policy/sequential/launch.hpp @@ -119,7 +119,7 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -133,7 +133,7 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -155,7 +155,7 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -187,7 +187,7 @@ template struct LoopICountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, @@ -200,7 +200,7 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, @@ -222,7 +222,7 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, From e41e970e43bae0b9710141257cf91a6eb0902cf1 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 19 Dec 2025 02:38:04 -0800 Subject: [PATCH 28/29] make style --- .../pattern/launch/launch_context_policy.hpp | 114 ++++++++++-------- 1 file changed, 63 insertions(+), 51 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_context_policy.hpp b/include/RAJA/pattern/launch/launch_context_policy.hpp index 49e3d03625..acf707615d 100644 --- a/include/RAJA/pattern/launch/launch_context_policy.hpp +++ b/include/RAJA/pattern/launch/launch_context_policy.hpp @@ -34,79 +34,91 @@ namespace detail { -template < typename T, typename = void > -struct has_single_call_operator : std::false_type {}; - -template < typename T > -struct has_single_call_operator::operator()), void>>> : std::true_type {}; - - -template -struct function_traits{}; - -template -struct function_traits { - using result_type = R; +template +struct has_single_call_operator : std::false_type +{}; + +template +struct has_single_call_operator< + T, + std::enable_if_t< + !std::is_same_v::operator()), void>>> + : std::true_type +{}; + +template +struct function_traits +{}; + +template +struct function_traits +{ + using result_type = R; static constexpr std::size_t arity = sizeof...(Args); - template - struct arg { + template + struct arg + { static_assert(N < arity, "argument index out of range"); using type = typename std::tuple_element>::type; }; }; -template -struct function_traits : function_traits {}; +template +struct function_traits : function_traits +{}; -template -struct function_traits : function_traits {}; +template +struct function_traits : function_traits +{}; -template -struct function_traits - : function_traits { +template +struct function_traits : function_traits +{ using functional_type = C; }; -template -struct function_traits - : function_traits { +template +struct function_traits : function_traits +{ using functional_type = C; }; - -template >::value> -struct functional_traits : function_traits> {}; - -template -struct functional_traits : function_traits::operator())> {}; - - -template -struct has_arg0 : std::false_type {}; - -template -struct has_arg0< - T, - typename std::enable_if_t< - !std::is_same_v::template arg<0>::type, void> - > -> : std::true_type {}; - - -template ::value> -struct launch_context_type { +template>::value> +struct functional_traits : function_traits> +{}; + +template +struct functional_traits + : function_traits::operator())> +{}; + +template +struct has_arg0 : std::false_type +{}; + +template +struct has_arg0::template arg<0>::type, + void>>> : std::true_type +{}; + +template::value> +struct launch_context_type +{ using type = LaunchContextT; }; -template -struct launch_context_type { +template +struct launch_context_type +{ using type = typename functional_traits::template arg<0>::type; }; -} // namespace detail +} // namespace detail } // namespace RAJA #endif From 7c95430c71b1186aca31c39c96a72f779970ef31 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 19 Dec 2025 02:55:17 -0800 Subject: [PATCH 29/29] use constexpt for getting dim values --- include/RAJA/policy/cuda/launch.hpp | 25 ++++++++++++++++++++++--- include/RAJA/policy/hip/launch.hpp | 24 ++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index fa6e415d35..3368928d1b 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -284,6 +284,24 @@ using cuda_ctx_thread_loop_x = cuda_ctx_thread_loop; using cuda_ctx_thread_loop_y = cuda_ctx_thread_loop; using cuda_ctx_thread_loop_z = cuda_ctx_thread_loop; +template +RAJA_INLINE RAJA_DEVICE int get_dim(Dim3Like const& d) +{ + if constexpr (DIM == named_dim::x) + { + return d.x; + } + else if constexpr (DIM == named_dim::y) + { + return d.y; + } + else + { + static_assert(DIM == named_dim::z, "Unsupported named_dim"); + return d.z; + } +} + } // namespace expt template @@ -296,12 +314,13 @@ struct LoopExecute, SEGMENT> SEGMENT const& segment, BODY const& body) { - const int len = segment.end() - segment.begin(); constexpr int int_dim = static_cast(DIM); - for (int i = ::RAJA::internal::CudaDimHelper::get(ctx.thread_id); - i < len; i += ::RAJA::internal::CudaDimHelper::get(ctx.block_dim)) + const int thread_idx = expt::get_dim(ctx.thread_id); + const int stride = expt::get_dim(ctx.block_dim); + + for (int i = thread_idx; i < len; i += stride) { body(*(segment.begin() + i)); } diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index e9c42d81ec..3bf98d022e 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -279,6 +279,24 @@ using hip_ctx_thread_loop_x = hip_ctx_thread_loop; using hip_ctx_thread_loop_y = hip_ctx_thread_loop; using hip_ctx_thread_loop_z = hip_ctx_thread_loop; +template +RAJA_INLINE RAJA_DEVICE int get_dim(Dim3Like const& d) +{ + if constexpr (DIM == named_dim::x) + { + return d.x; + } + else if constexpr (DIM == named_dim::y) + { + return d.y; + } + else + { + static_assert(DIM == named_dim::z, "Unsupported named_dim"); + return d.z; + } +} + } // namespace expt /* @@ -299,8 +317,10 @@ struct LoopExecute, SEGMENT> const int len = segment.end() - segment.begin(); constexpr int int_dim = static_cast(DIM); - for (int i = ::RAJA::internal::HipDimHelper::get(ctx.thread_id); - i < len; i += ::RAJA::internal::HipDimHelper::get(ctx.block_dim)) + const int thread_idx = expt::get_dim(ctx.thread_id); + const int stride = expt::get_dim(ctx.block_dim); + + for (int i = thread_idx; i < len; i += stride) { body(*(segment.begin() + i)); }