diff --git a/src/care/DefaultMacros.h b/src/care/DefaultMacros.h index 62c48072..6ca2b0b5 100644 --- a/src/care/DefaultMacros.h +++ b/src/care/DefaultMacros.h @@ -34,6 +34,12 @@ /// Used to capture variables by reference into a lambda (combine with FOR_EACH) #define CARE_REF_CAPTURE(X) , &X +#ifdef CARE_ENABLE_RACE_DETECTION +#define CARE_SET_THREAD_ID(INDEX) care::DebugPlugin::s_threadID = INDEX ; +#else +#define CARE_SET_THREAD_ID(INDEX) +#endif + @@ -89,6 +95,29 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK) } OMP_FOR_END CARE_NEST_END(CHECK) } +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked vanilla OpenMP 3.0 for loop. +/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) {\ + CARE_NEST_BEGIN(CHECK) \ + auto const _care_openmp_for_loop_end_ndx = END_INDEX; \ + decltype(_care_openmp_for_loop_end_ndx) _care_openmp_for_loop_ndx = START_INDEX; \ + decltype(_care_openmp_for_loop_end_ndx) _care_open_chunked_for_loop_chunk_size = CHUNK_SIZE > 0 ? CHUNK_SIZE : END_INDEX - START_INDEX ; \ + while (_care_openmp_for_loop_ndx < _care_openmp_for_loop_end_ndx) { \ + decltype(_care_openmp_for_loop_end_ndx) _care_openmp_for_loop_chunk_begin_ndx = _care_openmp_for_loop_ndx ; \ + decltype(_care_openmp_for_loop_end_ndx) _care_openmp_for_loop_chunk_end_ndx = (_care_openmp_for_loop_ndx + _care_open_chunked_for_loop_chunk_size < _care_openmp_for_loop_end_ndx) ? 
_care_openmp_for_loop_ndx + _care_open_chunked_for_loop_chunk_size : _care_openmp_for_loop_end_ndx ; _care_openmp_for_loop_ndx = _care_openmp_for_loop_chunk_end_ndx ; \ +OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_chunk_begin_ndx; INDEX < _care_openmp_for_loop_chunk_end_ndx; ++INDEX) {\ + +#define CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_END(CHECK) } OMP_FOR_END } CARE_NEST_END(CHECK) } + @@ -133,6 +162,24 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_LOOP_END(CHECK) CARE_CHECKED_FOR_LOOP_END(CHECK) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a call to forall with the given execution policy. +/// This is for compatibility with chunked GPU loops. +/// The legacy version uses a raw for loop. +/// +/// @arg[in] POLICY The execution policy +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Not used +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_LOOP_START(POLICY, INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) CARE_CHECKED_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) + +#define CARE_CHECKED_CHUNKED_LOOP_END(CHECK) CARE_CHECKED_FOR_LOOP_END(CHECK) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a sequential RAJA loop. The legacy version @@ -206,6 +253,23 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_OPENMP_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked OpenMP RAJA loop. If OpenMP is not +/// available, executes sequentially on the host. 
The legacy version +/// uses raw OpenMP. +/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_OPENMP_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) + +#define CARE_CHECKED_CHUNKED_OPENMP_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_END(CHECK) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end an OpenMP RAJA loop that captures some @@ -237,6 +301,22 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_GPU_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked GPU RAJA loop. If GPU is not available, +/// executes sequentially on the host. The legacy version uses raw OpenMP. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_GPU_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) + +#define CARE_CHECKED_CHUNKED_GPU_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_END(CHECK) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is @@ -272,6 +352,29 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_PARALLEL_LOOP_END(CHECK) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked parallel RAJA loop. If GPU is available, +/// executes on the device. If GPU is not available but OpenMP is, +/// executes in parallel on the host. Otherwise, executes sequentially +/// on the host. The legacy version uses raw OpenMP. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) + +#define CARE_CHECKED_CHUNKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_END(CHECK) + +#define CARE_CHECKED_CHUNKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) \ + CARE_CHECKED_CHUNKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) + +#define CARE_CHECKED_CHUNKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_PARALLEL_LOOP_END(CHECK) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is @@ -305,6 +408,26 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_MANAGED_PTR_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked RAJA loop that uses at least one +/// managed_ptr. If GPU is available, and managed_ptr is available +/// on the device, executes on the device. If GPU is not available +/// but OpenMP is, executes in parallel on the host. Otherwise, +/// executes sequentially on the host. The legacy version uses raw +/// OpenMP. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_MANAGED_PTR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) + +#define CARE_CHECKED_CHUNKED_MANAGED_PTR_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_END(CHECK) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros for updating/initializing managed_ptrs. @@ -369,11 +492,31 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_LOOP_START(POLICY, INDEX, START_INDEX, END_INDEX, CHECK) { \ if (END_INDEX > START_INDEX) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(POLICY, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] CARE_HOST_DEVICE (const int INDEX) { + care::forall(POLICY, __FILE__, __LINE__, START_INDEX, END_INDEX, 0, [=] CARE_HOST_DEVICE (const int INDEX) { #define CARE_CHECKED_LOOP_END(CHECK) }); \ CARE_NEST_END(CHECK) }} +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a call to a chunked forall with the given execution policy. 
+/// +/// @arg[in] POLICY The execution policy +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum chunk size for each kernel +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_LOOP_START(POLICY, INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) { \ + if (END_INDEX > START_INDEX) { \ + CARE_NEST_BEGIN(CHECK) \ + care::forall(POLICY, __FILE__, __LINE__, START_INDEX, END_INDEX, CHUNK_SIZE, [=] CARE_HOST_DEVICE (const int INDEX) { + +#define CARE_CHECKED_CHUNKED_LOOP_END(CHECK) }); \ + CARE_NEST_END(CHECK) }} + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a sequential RAJA loop. @@ -387,7 +530,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_SEQUENTIAL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) { \ if (END_INDEX > START_INDEX) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::sequential{}, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] (const int INDEX) { + care::forall(care::sequential{}, __FILE__, __LINE__, START_INDEX, END_INDEX, 0, [=] (const int INDEX) { #define CARE_CHECKED_SEQUENTIAL_LOOP_END(CHECK) }); \ CARE_NEST_END(CHECK) }} @@ -407,7 +550,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_SEQUENTIAL_LOOP_WITH_REF_START(INDEX, START_INDEX, END_INDEX, CHECK, ...) 
{ \ if (END_INDEX > START_INDEX) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::sequential{}, __FILE__, __LINE__, START_INDEX, END_INDEX, [= FOR_EACH(CARE_REF_CAPTURE, __VA_ARGS__)] (const int INDEX) { + care::forall(care::sequential{}, __FILE__, __LINE__, START_INDEX, END_INDEX, 0, [= FOR_EACH(CARE_REF_CAPTURE, __VA_ARGS__)] (const int INDEX) { #define CARE_CHECKED_SEQUENTIAL_LOOP_WITH_REF_END(CHECK) }); \ CARE_NEST_END(CHECK) }} @@ -421,7 +564,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o //////////////////////////////////////////////////////////////////////////////// #define CARE_CHECKED_HOST_KERNEL_START(CHECK) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::sequential{}, __FILE__, __LINE__, 0, 1, [=] (const int) { + care::forall(care::sequential{}, __FILE__, __LINE__, 0, 1, 0, [=] (const int) { #define CARE_CHECKED_HOST_KERNEL_END(CHECK) }); \ CARE_NEST_END(CHECK) } @@ -437,7 +580,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o //////////////////////////////////////////////////////////////////////////////// #define CARE_CHECKED_HOST_KERNEL_WITH_REF_START(CHECK, ...) 
{ \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::sequential{}, __FILE__, __LINE__, 0, 1, [= FOR_EACH(CARE_REF_CAPTURE, __VA_ARGS__)] (const int) { + care::forall(care::sequential{}, __FILE__, __LINE__, 0, 1, 0, [= FOR_EACH(CARE_REF_CAPTURE, __VA_ARGS__)] (const int) { #define CARE_CHECKED_HOST_KERNEL_WITH_REF_END(CHECK) }); \ CARE_NEST_END(CHECK) } @@ -456,11 +599,31 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_OPENMP_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) { \ if (END_INDEX > START_INDEX) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::openmp{}, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] (const int INDEX) { + care::forall(care::openmp{}, __FILE__, __LINE__, START_INDEX, END_INDEX, 0, [=] (const int INDEX) { #define CARE_CHECKED_OPENMP_LOOP_END(CHECK) }); \ CARE_NEST_END(CHECK) }} +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked OpenMP RAJA loop. If OpenMP is not +/// available, executes sequentially on the host. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum size of kernel +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_OPENMP_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) { \ + if (END_INDEX > START_INDEX) { \ + CARE_NEST_BEGIN(CHECK) \ + care::forall(care::openmp{}, __FILE__, __LINE__, START_INDEX, END_INDEX, CHUNK_SIZE, [=] (const int INDEX) { + +#define CARE_CHECKED_CHUNKED_OPENMP_LOOP_END(CHECK) }); \ + CARE_NEST_END(CHECK) }} + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end an OpenMP RAJA loop that captures some @@ -476,7 +639,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_OPENMP_LOOP_WITH_REF_START(INDEX, START_INDEX, END_INDEX, CHECK, ...) 
{ \ if (END_INDEX > START_INDEX) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::openmp{}, __FILE__, __LINE__, START_INDEX, END_INDEX, [= FOR_EACH(CARE_REF_CAPTURE, __VA_ARGS__)] (const int INDEX) { + care::forall(care::openmp{}, __FILE__, __LINE__, START_INDEX, END_INDEX, 0, [= FOR_EACH(CARE_REF_CAPTURE, __VA_ARGS__)] (const int INDEX) { #define CARE_CHECKED_OPENMP_LOOP_WITH_REF_END(CHECK) }); \ CARE_NEST_END(CHECK) }} @@ -491,7 +654,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o //////////////////////////////////////////////////////////////////////////////// #define CARE_CHECKED_OPENMP_KERNEL_START(CHECK) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::openmp{}, __FILE__, __LINE__, 0, 1, [=] (const int) { + care::forall(care::openmp{}, __FILE__, __LINE__, 0, 1, 0, [=] (const int) { #define CARE_CHECKED_OPENMP_KERNEL_END(CHECK) }); \ CARE_NEST_END(CHECK) } @@ -510,11 +673,31 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_GPU_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) { \ if (END_INDEX > START_INDEX) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::gpu{}, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] CARE_DEVICE (const int INDEX) { + care::forall(care::gpu{}, __FILE__, __LINE__, START_INDEX, END_INDEX, 0, [=] CARE_DEVICE (const int INDEX) { #define CARE_CHECKED_GPU_LOOP_END(CHECK) }); \ CARE_NEST_END(CHECK) }} +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked GPU RAJA loop. If GPU is not available, +/// executes sequentially on the host. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_GPU_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) { \ + if (END_INDEX > START_INDEX) { \ + CARE_NEST_BEGIN(CHECK) \ + care::forall(care::gpu{}, __FILE__, __LINE__, START_INDEX, END_INDEX, CHUNK_SIZE, [=] CARE_DEVICE (const int INDEX) { + +#define CARE_CHECKED_CHUNKED_GPU_LOOP_END(CHECK) }); \ + CARE_NEST_END(CHECK) }} + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is @@ -525,7 +708,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o //////////////////////////////////////////////////////////////////////////////// #define CARE_CHECKED_GPU_KERNEL_START(CHECK) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::gpu{}, __FILE__, __LINE__, 0, 1, [=] CARE_DEVICE (const int) { + care::forall(care::gpu{}, __FILE__, __LINE__, 0, 1, 0, [=] CARE_DEVICE (const int) { #define CARE_CHECKED_GPU_KERNEL_END(CHECK) }); \ CARE_NEST_END(CHECK) } @@ -544,18 +727,12 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o /// //////////////////////////////////////////////////////////////////////////////// -#ifdef CARE_ENABLE_RACE_DETECTION -#define CARE_SET_THREAD_ID(INDEX) care::DebugPlugin::s_threadID = INDEX ; -#else -#define CARE_SET_THREAD_ID(INDEX) -#endif - #define CARE_CHECKED_POLICY_LOOP_START(POLICY, INDEX, START_INDEX, END_INDEX, CHECK) { \ auto _care_checked_loop_end = END_INDEX; \ decltype(_care_checked_loop_end) _care_checked_loop_begin = START_INDEX; \ if (_care_checked_loop_end > 
_care_checked_loop_begin) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(POLICY{}, __FILE__, __LINE__, _care_checked_loop_begin, _care_checked_loop_end, [=] CARE_DEVICE (decltype(_care_checked_loop_end) INDEX) { \ + care::forall(POLICY{}, __FILE__, __LINE__, _care_checked_loop_begin, _care_checked_loop_end, 0, [=] CARE_DEVICE (decltype(_care_checked_loop_end) INDEX) { \ CARE_SET_THREAD_ID(INDEX) #define CARE_CHECKED_POLICY_LOOP_END(CHECK) }); \ @@ -571,6 +748,43 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_POLICY_LOOP_END(CHECK) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked parallel RAJA loop. If GPU is available, +/// executes on the device. If GPU is not available but OpenMP is, +/// executes in parallel on the host. Otherwise, executes sequentially +/// on the host. +/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// + +#define CARE_CHECKED_CHUNKED_POLICY_LOOP_START(POLICY, INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) { \ + auto _care_checked_loop_end = END_INDEX; \ + decltype(_care_checked_loop_end) _care_checked_loop_begin = START_INDEX; \ + if (_care_checked_loop_end > _care_checked_loop_begin) { \ + CARE_NEST_BEGIN(CHECK) \ + care::forall(POLICY{}, __FILE__, __LINE__, _care_checked_loop_begin, _care_checked_loop_end, CHUNK_SIZE, [=] CARE_DEVICE (decltype(_care_checked_loop_end) INDEX) { \ + CARE_SET_THREAD_ID(INDEX) + +#define CARE_CHECKED_CHUNKED_POLICY_LOOP_END(CHECK) }); \ + CARE_NEST_END(CHECK) }} + +#define CARE_CHECKED_CHUNKED_PARALLEL_LOOP_START(INDEX, 
START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) \ + CARE_CHECKED_CHUNKED_POLICY_LOOP_START(care::parallel,INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) + +#define CARE_CHECKED_CHUNKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_POLICY_LOOP_END(CHECK) + +#define CARE_CHECKED_CHUNKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) \ + CARE_CHECKED_CHUNKED_POLICY_LOOP_START(care::parallel_reduce,INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) + +#define CARE_CHECKED_CHUNKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_POLICY_LOOP_END(CHECK) + + //////////////////////////////////////////////////////////////////////////////// /// @@ -582,7 +796,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o //////////////////////////////////////////////////////////////////////////////// #define CARE_CHECKED_PARALLEL_KERNEL_START(CHECK) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::parallel{}, __FILE__, __LINE__, 0, 1, [=] CARE_DEVICE (const int) { + care::forall(care::parallel{}, __FILE__, __LINE__, 0, 1, 0, [=] CARE_DEVICE (const int) { #define CARE_CHECKED_PARALLEL_KERNEL_END(CHECK) }); \ CARE_NEST_END(CHECK) } @@ -605,11 +819,35 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_MANAGED_PTR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) { \ if (END_INDEX > START_INDEX) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::managed_ptr_read{}, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] CARE_MANAGED_PTR_DEVICE (const int INDEX) { + care::forall(care::managed_ptr_read{}, __FILE__, __LINE__, START_INDEX, END_INDEX, 0, [=] CARE_MANAGED_PTR_DEVICE (const int INDEX) { #define CARE_CHECKED_MANAGED_PTR_LOOP_END(CHECK) }); \ CARE_NEST_END(CHECK) }} +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked RAJA loop that uses at least one +/// managed_ptr. 
If GPU is available, and managed_ptr is available +/// on the device, executes on the device. If GPU is not available +/// but OpenMP is, executes in parallel on the host. Otherwise, +/// executes sequentially on the host. The legacy version uses raw +/// OpenMP. +/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_MANAGED_PTR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) { \ + if (END_INDEX > START_INDEX) { \ + CARE_NEST_BEGIN(CHECK) \ + care::forall(care::managed_ptr_read{}, __FILE__, __LINE__, START_INDEX, END_INDEX, CHUNK_SIZE, [=] CARE_MANAGED_PTR_DEVICE (const int INDEX) { + +#define CARE_CHECKED_CHUNKED_MANAGED_PTR_LOOP_END(CHECK) }); \ + CARE_NEST_END(CHECK) }} + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros for updating/initializing managed_ptrs. 
@@ -622,7 +860,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o //////////////////////////////////////////////////////////////////////////////// #define CARE_CHECKED_MANAGED_PTR_UPDATE_KERNEL_START(CHECK) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::managed_ptr_write{}, __FILE__, __LINE__, 0, 1, [=] CARE_MANAGED_PTR_HOST_DEVICE (const int) { + care::forall(care::managed_ptr_write{}, __FILE__, __LINE__, 0, 1, 0, [=] CARE_MANAGED_PTR_HOST_DEVICE (const int) { #define CARE_CHECKED_MANAGED_PTR_UPDATE_KERNEL_END(CHECK) }); \ CARE_NEST_END(CHECK) } @@ -652,6 +890,21 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_LOOP_END CARE_CHECKED_LOOP_END(care_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a call to chunked forall. +/// +/// @arg[in] POLICY The execution policy to use +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_LOOP(POLICY, INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_LOOP_START(POLICY, INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_loop_chunked_check) + +#define CARE_CHUNKED_LOOP_END CARE_CHECKED_CHUNKED_LOOP_END(care_loop_chunked_check) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a sequential RAJA loop. @@ -731,6 +984,21 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_OPENMP_LOOP_END CARE_CHECKED_OPENMP_LOOP_END(care_openmp_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked OpenMP RAJA loop. 
If OpenMP is not +/// available, executes sequentially on the host. +/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_OPENMP_LOOP(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_OPENMP_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_openmp_loop_chunked_check) + +#define CARE_CHUNKED_OPENMP_LOOP_END CARE_CHECKED_CHUNKED_OPENMP_LOOP_END(care_openmp_loop_chunked_check) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end an OpenMP RAJA loop that captures some @@ -761,6 +1029,22 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_GPU_LOOP_END CARE_CHECKED_GPU_LOOP_END(care_gpu_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked RAJA loop. If GPU is available, +/// executes on the device. If GPU is not available, executes +/// sequentially on the host. +/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_GPU_LOOP(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_GPU_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_gpu_loop_chunked_check) + +#define CARE_CHUNKED_GPU_LOOP_END CARE_CHECKED_CHUNKED_GPU_LOOP_END(care_gpu_loop_chunked_check) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a parallel RAJA loop. 
If GPU is available, @@ -777,6 +1061,24 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_PARALLEL_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_parallel_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked parallel RAJA loop. If GPU is available, +/// executes on the device. If GPU is not available but OpenMP is, +/// executes in parallel on the host. Otherwise, executes sequentially +/// on the host. +/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_PARALLEL_LOOP(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_parallel_loop_chunked_check) + +#define CARE_CHUNKED_PARALLEL_LOOP_END CARE_CHECKED_CHUNKED_PARALLEL_LOOP_END(care_parallel_loop_chunked_check) + + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a RAJA loop that uses at least one @@ -795,6 +1097,25 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_MANAGED_PTR_LOOP_END CARE_CHECKED_MANAGED_PTR_LOOP_END(care_managed_ptr_read_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked RAJA loop that uses at least one +/// managed_ptr. If GPU is available, and managed_ptr is available +/// on the device, executes on the device. If GPU is not available +/// but OpenMP is, executes in parallel on the host. Otherwise, +/// executes sequentially on the host. The legacy version uses raw +/// OpenMP. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_MANAGED_PTR_LOOP(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_MANAGED_PTR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_managed_ptr_read_loop_chunked_check) + +#define CARE_CHUNKED_MANAGED_PTR_LOOP_END CARE_CHECKED_CHUNKED_MANAGED_PTR_LOOP_END(care_managed_ptr_read_loop_chunked_check) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a parallel RAJA loop. If GPU is available, @@ -813,6 +1134,25 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_WORK_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_work_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked parallel RAJA loop. If GPU is available, +/// executes on the device. If GPU is not available but OpenMP is, +/// executes in parallel on the host. Otherwise, executes sequentially +/// on the host. +/// +/// WORK is an alias to PARALLEL that indicates a lot of work is taking place. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_WORK_LOOP(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_work_loop_chunked_check) + +#define CARE_CHUNKED_WORK_LOOP_END CARE_CHECKED_CHUNKED_PARALLEL_LOOP_END(care_work_loop_chunked_check) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a parallel RAJA loop. If GPU is available, @@ -831,6 +1171,24 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_STREAM_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_stream_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked parallel RAJA loop. If GPU is available, +/// executes on the device. If GPU is not available but OpenMP is, +/// executes in parallel on the host. Otherwise, executes sequentially +/// on the host. +/// +/// STREAM is an alias to PARALLEL that indicates not much work is taking place. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_STREAM_LOOP(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_stream_loop_chunked_check) + +#define CARE_CHUNKED_STREAM_LOOP_END CARE_CHECKED_CHUNKED_PARALLEL_LOOP_END(care_stream_loop_chunked_check) //////////////////////////////////////////////////////////////////////////////// /// @@ -850,6 +1208,25 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_REDUCE_LOOP_END CARE_CHECKED_REDUCE_LOOP_END(care_reduce_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked parallel RAJA loop. If GPU is available, +/// executes on the device. If GPU is not available but OpenMP is, +/// executes in parallel on the host. Otherwise, executes sequentially +/// on the host. +/// +/// REDUCE is an alias to PARALLEL that indicates a reduction is taking place. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_REDUCE_LOOP(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_reduce_loop_chunked_check) + +#define CARE_CHUNKED_REDUCE_LOOP_END CARE_CHECKED_CHUNKED_REDUCE_LOOP_END(care_reduce_loop_chunked_check) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a sequential RAJA loop of length one. diff --git a/src/care/forall.h b/src/care/forall.h index 22813b1e..b7b79502 100644 --- a/src/care/forall.h +++ b/src/care/forall.h @@ -71,33 +71,51 @@ namespace care { /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// //////////////////////////////////////////////////////////////////////////////// template void forall(ExecutionPolicy /* policy */, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { + const int start, const int end, const int batch_size, LB&& body) { const int length = end - start; if (length != 0) { PluginData::setFileName(fileName); PluginData::setLineNumber(lineNumber); + int index = start ; + int chunk_size = batch_size > 0 ? batch_size : length ; + + while (index < end) { + int chunk_start = index ; + int chunk_end = (index + chunk_size < end) ? index + chunk_size : end ; #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS - RAJA::RangeStrideSegment rangeSegment = - s_reverseLoopOrder ? 
- RAJA::RangeStrideSegment(end - 1, start - 1, -1) : - RAJA::RangeStrideSegment(start, end, 1); + RAJA::RangeStrideSegment rangeSegment = + s_reverseLoopOrder ? + RAJA::RangeStrideSegment(chunk_end - 1, chunk_start - 1, -1) : + RAJA::RangeStrideSegment(chunk_start, chunk_end, 1); #else - RAJA::RangeSegment rangeSegment = RAJA::RangeSegment(start, end); + RAJA::RangeSegment rangeSegment = RAJA::RangeSegment(chunk_start, chunk_end); #endif #if CARE_ENABLE_GPU_SIMULATION_MODE - RAJA::forall(rangeSegment, std::forward(body)); + chai::ArrayManager* threadRM = chai::ArrayManager::getInstance(); + if (ExecutionPolicyToSpace::value == chai::GPU) { + threadRM->setGPUSimMode(true); + } + else { + threadRM->setGPUSimMode(false); + } + RAJA::forall(rangeSegment, std::forward(body)); + threadRM->setGPUSimMode(false); #else - RAJA::forall(rangeSegment, std::forward(body)); + RAJA::forall(rangeSegment, std::forward(body)); #endif + + index += chunk_size ; + } } } @@ -112,13 +130,14 @@ namespace care { /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// //////////////////////////////////////////////////////////////////////////////// template void forall(sequential, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { - forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, std::forward(body)); + const int start, const int end, const int batch_size, LB&& body) { + forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); } //////////////////////////////////////////////////////////////////////////////// @@ -133,20 +152,21 @@ namespace care { /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The 
starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// //////////////////////////////////////////////////////////////////////////////// template void forall(openmp, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { + const int start, const int end, const int batch_size, LB&& body) { #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS s_reverseLoopOrder = true; #endif #if defined(_OPENMP) && defined(RAJA_ENABLE_OPENMP) - forall(RAJA::omp_parallel_for_exec{}, fileName, lineNumber, start, end, std::forward(body)); + forall(RAJA::omp_parallel_for_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #else - forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, std::forward(body)); + forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #endif #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS @@ -166,26 +186,27 @@ namespace care { /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// //////////////////////////////////////////////////////////////////////////////// template void forall(gpu, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { + const int start, const int end, const int batch_size, LB&& body) { #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS s_reverseLoopOrder = true; #endif #if CARE_ENABLE_GPU_SIMULATION_MODE - forall(gpu_simulation{}, fileName, lineNumber, start, end, std::forward(body)); + forall(gpu_simulation{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(__CUDACC__) forall(RAJA::cuda_exec{}, - 
fileName, lineNumber, start, end, std::forward(body)); + fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(__HIPCC__) forall(RAJA::hip_exec{}, - fileName, lineNumber, start, end, std::forward(body)); + fileName, lineNumber, start, end, batch_size, std::forward(body)); #else - forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, std::forward(body)); + forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #endif #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS @@ -206,29 +227,30 @@ namespace care { /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// //////////////////////////////////////////////////////////////////////////////// template void forall(parallel, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { + const int start, const int end, const int batch_size, LB&& body) { #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS s_reverseLoopOrder = true; #endif PluginData::setParallelContext(true); #if CARE_ENABLE_GPU_SIMULATION_MODE - forall(gpu_simulation{}, fileName, lineNumber, start, end, std::forward(body)); + forall(gpu_simulation{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(__CUDACC__) forall(RAJA::cuda_exec{}, - fileName, lineNumber, start, end, std::forward(body)); + fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(__HIPCC__) forall(RAJA::hip_exec{}, - fileName, lineNumber, start, end, std::forward(body)); + fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(_OPENMP) && defined(RAJA_ENABLE_OPENMP) - forall(RAJA::omp_parallel_for_exec{}, fileName, lineNumber, start, end, std::forward(body)); + 
forall(RAJA::omp_parallel_for_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #else - forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, std::forward(body)); + forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #endif PluginData::setParallelContext(false); @@ -243,23 +265,24 @@ namespace care { /// /// @brief Execute using the care::RAJAReductionExec policy /// - /// @arg[in] parallel_reducew Used to choose this overload of forall + /// @arg[in] parallel_reduce Used to choose this overload of forall /// @arg[in] fileName The name of the file where this function is called /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// //////////////////////////////////////////////////////////////////////////////// template void forall(parallel_reduce, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { + const int start, const int end, const int batch_size, LB&& body) { #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS s_reverseLoopOrder = true; #endif PluginData::setParallelContext(true); - forall(RAJAReductionExec{}, fileName, lineNumber, start, end, std::forward(body)); + forall(RAJAReductionExec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); PluginData::setParallelContext(false); @@ -282,24 +305,25 @@ namespace care { /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// 
//////////////////////////////////////////////////////////////////////////////// template void forall(managed_ptr_read, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { + const int start, const int end, const int batch_size, LB&& body) { #if CARE_ENABLE_GPU_SIMULATION_MODE && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) - forall(gpu_simulation{}, fileName, lineNumber, start, end, std::forward(body)); + forall(gpu_simulation{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(__CUDACC__) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) forall(RAJA::cuda_exec{}, - fileName, lineNumber, start, end, std::forward(body)); + fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(__HIPCC__) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) forall(RAJA::hip_exec{}, - fileName, lineNumber, start, end, std::forward(body)); + fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(_OPENMP) && defined(RAJA_ENABLE_OPENMP) - forall(RAJA::omp_parallel_for_exec{}, fileName, lineNumber, start, end, std::forward(body)); + forall(RAJA::omp_parallel_for_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #else - forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, std::forward(body)); + forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #endif } @@ -433,9 +457,9 @@ namespace care { //////////////////////////////////////////////////////////////////////////////// template void forall(managed_ptr_write, const char * fileName, int lineNumber, - int start, const int end, LB body) { + const int start, const int end, const int batch_size, LB&& body) { // preLoopPrint and postLoopPrint are handled in this call. 
- forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, body); + forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, batch_size, body); #if defined(CARE_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) const int length = end - start; @@ -444,12 +468,23 @@ namespace care { chai::ArrayManager* threadRM = chai::ArrayManager::getInstance(); threadRM->setExecutionSpace(chai::GPU); + int index = start ; + int chunk_size = batch_size > 0 ? batch_size : length ; + + while (index < end) { + int chunk_start = index ; + int chunk_end = (index + chunk_size < end) ? index + chunk_size : end ; + RAJA::RangeSegment rangeSegment = RAJA::RangeSegment(chunk_start, chunk_end); + #if defined(__CUDACC__) - RAJA::forall< RAJA::cuda_exec>(RAJA::RangeSegment(start, end), body); + RAJA::forall< RAJA::cuda_exec>(rangeSegment, body); #elif defined(__HIPCC__) - RAJA::forall< RAJA::hip_exec>(RAJA::RangeSegment(start, end), body); + RAJA::forall< RAJA::hip_exec>(rangeSegment, body); #endif + index += chunk_size ; + } + #if FORCE_SYNC && defined(CARE_GPUCC) care::gpuDeviceSynchronize(fileName, lineNumber); #endif @@ -471,30 +506,31 @@ namespace care { /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// //////////////////////////////////////////////////////////////////////////////// template void forall(Policy&& policy, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { + const int start, const int end, const int batch_size, LB&& body) { switch (policy) { case Policy::sequential: - forall(sequential{}, fileName, lineNumber, start, end, std::forward(body)); + forall(sequential{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); break; case Policy::openmp: - 
forall(openmp{}, fileName, lineNumber, start, end, std::forward(body)); + forall(openmp{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); break; case Policy::gpu: - forall(gpu{}, fileName, lineNumber, start, end, std::forward(body)); + forall(gpu{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); break; case Policy::parallel: - forall(parallel{}, fileName, lineNumber, start, end, std::forward(body)); + forall(parallel{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); break; case Policy::managed_ptr_read: - forall(managed_ptr_read{}, fileName, lineNumber, start, end, std::forward(body)); + forall(managed_ptr_read{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); break; case Policy::managed_ptr_write: - forall(managed_ptr_write{}, fileName, lineNumber, start, end, std::forward(body)); + forall(managed_ptr_write{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); break; default: std::cout << "[CARE] Error: Invalid policy!" 
<< std::endl; diff --git a/test/TestForall.cpp b/test/TestForall.cpp index 2b10da77..8cadc3bc 100644 --- a/test/TestForall.cpp +++ b/test/TestForall.cpp @@ -56,6 +56,40 @@ CPU_TEST(forall, dynamic_policy) temp.free(); } +CPU_TEST(forall, chunked_static_policy) +{ + const int batch_size = 3; + const int length = 10; + care::host_device_ptr temp(length, "temp"); + + CARE_CHUNKED_LOOP(care::sequential{}, i, 0, length, batch_size) { + temp[i] = i; + } CARE_CHUNKED_LOOP_END + + CARE_SEQUENTIAL_LOOP(i, 0, length) { + EXPECT_EQ(temp[i], i); + } CARE_SEQUENTIAL_LOOP_END + + temp.free(); +} + +CPU_TEST(forall, chunked_dynamic_policy) +{ + const int batch_size = 3; + const int length = 10; + care::host_device_ptr temp(length, "temp"); + + CARE_CHUNKED_LOOP(care::Policy::sequential, i, 0, length, batch_size) { + temp[i] = i; + } CARE_CHUNKED_LOOP_END + + CARE_SEQUENTIAL_LOOP(i, 0, length) { + EXPECT_EQ(temp[i], i); + } CARE_SEQUENTIAL_LOOP_END + + temp.free(); +} + #if defined(CARE_ENABLE_RACE_DETECTION) CPU_TEST(forall, race_condition_detection) { @@ -117,5 +151,40 @@ GPU_TEST(forall, dynamic_policy) temp.free(); } +GPU_TEST(forall, chunked_static_policy) +{ + const int batch_size = 3; + const int length = 10; + care::host_device_ptr temp(length, "temp"); + + CARE_CHUNKED_LOOP(care::gpu{}, i, 0, length, batch_size) { + temp[i] = i; + } CARE_CHUNKED_LOOP_END + + CARE_SEQUENTIAL_LOOP(i, 0, length) { + EXPECT_EQ(temp[i], i); + } CARE_SEQUENTIAL_LOOP_END + + temp.free(); +} + +GPU_TEST(forall, chunked_dynamic_policy) +{ + const int batch_size = 3; + const int length = 10; + care::host_device_ptr temp(length, "temp"); + + CARE_CHUNKED_LOOP(care::Policy::gpu, i, 0, length, batch_size) { + temp[i] = i; + } CARE_CHUNKED_LOOP_END + + CARE_SEQUENTIAL_LOOP(i, 0, length) { + EXPECT_EQ(temp[i], i); + } CARE_SEQUENTIAL_LOOP_END + + temp.free(); +} + + #endif // CARE_GPUCC