diff --git a/src/care/DefaultMacros.h b/src/care/DefaultMacros.h index 62c48072..6ca2b0b5 100644 --- a/src/care/DefaultMacros.h +++ b/src/care/DefaultMacros.h @@ -34,6 +34,12 @@ /// Used to capture variables by reference into a lambda (combine with FOR_EACH) #define CARE_REF_CAPTURE(X) , &X +#ifdef CARE_ENABLE_RACE_DETECTION +#define CARE_SET_THREAD_ID(INDEX) care::DebugPlugin::s_threadID = INDEX ; +#else +#define CARE_SET_THREAD_ID(INDEX) +#endif + @@ -89,6 +95,29 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK) } OMP_FOR_END CARE_NEST_END(CHECK) } +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked vanilla OpenMP 3.0 for loop. +/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) {\ + CARE_NEST_BEGIN(CHECK) \ + auto const _care_openmp_for_loop_end_ndx = END_INDEX; \ + decltype(_care_openmp_for_loop_end_ndx) _care_openmp_for_loop_ndx = START_INDEX; \ + decltype(_care_openmp_for_loop_end_ndx) _care_open_chunked_for_loop_chunk_size = CHUNK_SIZE > 0 ? CHUNK_SIZE : END_INDEX - START_INDEX ; \ + while (_care_openmp_for_loop_ndx < _care_openmp_for_loop_end_ndx) { \ + decltype(_care_openmp_for_loop_end_ndx) _care_openmp_for_loop_chunk_begin_ndx = _care_openmp_for_loop_ndx ; \ + decltype(_care_openmp_for_loop_end_ndx) _care_openmp_for_loop_chunk_end_ndx = (_care_openmp_for_loop_ndx + _care_open_chunked_for_loop_chunk_size < _care_openmp_for_loop_end_ndx) ? 
_care_openmp_for_loop_ndx + _care_open_chunked_for_loop_chunk_size : _care_openmp_for_loop_end_ndx ; _care_openmp_for_loop_ndx = _care_openmp_for_loop_chunk_end_ndx ; \ +OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_chunk_begin_ndx; INDEX < _care_openmp_for_loop_chunk_end_ndx; ++INDEX) {\ + +#define CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_END(CHECK) } OMP_FOR_END } CARE_NEST_END(CHECK) } + @@ -133,6 +162,24 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_LOOP_END(CHECK) CARE_CHECKED_FOR_LOOP_END(CHECK) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a call to forall with the given execution policy. +/// This is for compatibility with chunked GPU loops. +/// The legacy version uses a raw for loop. +/// +/// @arg[in] POLICY The execution policy +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Not used +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_LOOP_START(POLICY, INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) CARE_CHECKED_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) + +#define CARE_CHECKED_CHUNKED_LOOP_END(CHECK) CARE_CHECKED_FOR_LOOP_END(CHECK) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a sequential RAJA loop. The legacy version @@ -206,6 +253,23 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_OPENMP_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked OpenMP RAJA loop. If OpenMP is not +/// available, executes sequentially on the host. 
The legacy version +/// uses raw OpenMP. +/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_OPENMP_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) + +#define CARE_CHECKED_CHUNKED_OPENMP_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_END(CHECK) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end an OpenMP RAJA loop that captures some @@ -237,6 +301,22 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_GPU_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked GPU RAJA loop. If GPU is not available, +/// executes sequentially on the host. The legacy version uses raw OpenMP. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_GPU_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) + +#define CARE_CHECKED_CHUNKED_GPU_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_END(CHECK) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is @@ -272,6 +352,29 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_PARALLEL_LOOP_END(CHECK) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked parallel RAJA loop. If GPU is available, +/// executes on the device. If GPU is not available but OpenMP is, +/// executes in parallel on the host. Otherwise, executes sequentially +/// on the host. The legacy version uses raw OpenMP. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) + +#define CARE_CHECKED_CHUNKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_END(CHECK) + +#define CARE_CHECKED_CHUNKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) \ + CARE_CHECKED_CHUNKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) + +#define CARE_CHECKED_CHUNKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_PARALLEL_LOOP_END(CHECK) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is @@ -305,6 +408,26 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_MANAGED_PTR_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked RAJA loop that uses at least one +/// managed_ptr. If GPU is available, and managed_ptr is available +/// on the device, executes on the device. If GPU is not available +/// but OpenMP is, executes in parallel on the host. Otherwise, +/// executes sequentially on the host. The legacy version uses raw +/// OpenMP. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_MANAGED_PTR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) + +#define CARE_CHECKED_CHUNKED_MANAGED_PTR_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_OPENMP_FOR_LOOP_END(CHECK) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros for updating/initializing managed_ptrs. @@ -369,11 +492,31 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_LOOP_START(POLICY, INDEX, START_INDEX, END_INDEX, CHECK) { \ if (END_INDEX > START_INDEX) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(POLICY, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] CARE_HOST_DEVICE (const int INDEX) { + care::forall(POLICY, __FILE__, __LINE__, START_INDEX, END_INDEX, 0, [=] CARE_HOST_DEVICE (const int INDEX) { #define CARE_CHECKED_LOOP_END(CHECK) }); \ CARE_NEST_END(CHECK) }} +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a call to a chunked forall with the given execution policy. 
+/// +/// @arg[in] POLICY The execution policy +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum chunk size for each kernel +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_LOOP_START(POLICY, INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) { \ + if (END_INDEX > START_INDEX) { \ + CARE_NEST_BEGIN(CHECK) \ + care::forall(POLICY, __FILE__, __LINE__, START_INDEX, END_INDEX, CHUNK_SIZE, [=] CARE_HOST_DEVICE (const int INDEX) { + +#define CARE_CHECKED_CHUNKED_LOOP_END(CHECK) }); \ + CARE_NEST_END(CHECK) }} + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a sequential RAJA loop. @@ -387,7 +530,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_SEQUENTIAL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) { \ if (END_INDEX > START_INDEX) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::sequential{}, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] (const int INDEX) { + care::forall(care::sequential{}, __FILE__, __LINE__, START_INDEX, END_INDEX, 0, [=] (const int INDEX) { #define CARE_CHECKED_SEQUENTIAL_LOOP_END(CHECK) }); \ CARE_NEST_END(CHECK) }} @@ -407,7 +550,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_SEQUENTIAL_LOOP_WITH_REF_START(INDEX, START_INDEX, END_INDEX, CHECK, ...) 
{ \ if (END_INDEX > START_INDEX) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::sequential{}, __FILE__, __LINE__, START_INDEX, END_INDEX, [= FOR_EACH(CARE_REF_CAPTURE, __VA_ARGS__)] (const int INDEX) { + care::forall(care::sequential{}, __FILE__, __LINE__, START_INDEX, END_INDEX, 0, [= FOR_EACH(CARE_REF_CAPTURE, __VA_ARGS__)] (const int INDEX) { #define CARE_CHECKED_SEQUENTIAL_LOOP_WITH_REF_END(CHECK) }); \ CARE_NEST_END(CHECK) }} @@ -421,7 +564,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o //////////////////////////////////////////////////////////////////////////////// #define CARE_CHECKED_HOST_KERNEL_START(CHECK) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::sequential{}, __FILE__, __LINE__, 0, 1, [=] (const int) { + care::forall(care::sequential{}, __FILE__, __LINE__, 0, 1, 0, [=] (const int) { #define CARE_CHECKED_HOST_KERNEL_END(CHECK) }); \ CARE_NEST_END(CHECK) } @@ -437,7 +580,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o //////////////////////////////////////////////////////////////////////////////// #define CARE_CHECKED_HOST_KERNEL_WITH_REF_START(CHECK, ...) 
{ \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::sequential{}, __FILE__, __LINE__, 0, 1, [= FOR_EACH(CARE_REF_CAPTURE, __VA_ARGS__)] (const int) { + care::forall(care::sequential{}, __FILE__, __LINE__, 0, 1, 0, [= FOR_EACH(CARE_REF_CAPTURE, __VA_ARGS__)] (const int) { #define CARE_CHECKED_HOST_KERNEL_WITH_REF_END(CHECK) }); \ CARE_NEST_END(CHECK) } @@ -456,11 +599,31 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_OPENMP_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) { \ if (END_INDEX > START_INDEX) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::openmp{}, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] (const int INDEX) { + care::forall(care::openmp{}, __FILE__, __LINE__, START_INDEX, END_INDEX, 0, [=] (const int INDEX) { #define CARE_CHECKED_OPENMP_LOOP_END(CHECK) }); \ CARE_NEST_END(CHECK) }} +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked OpenMP RAJA loop. If OpenMP is not +/// available, executes sequentially on the host. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum size of kernel +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_OPENMP_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) { \ + if (END_INDEX > START_INDEX) { \ + CARE_NEST_BEGIN(CHECK) \ + care::forall(care::openmp{}, __FILE__, __LINE__, START_INDEX, END_INDEX, CHUNK_SIZE, [=] (const int INDEX) { + +#define CARE_CHECKED_CHUNKED_OPENMP_LOOP_END(CHECK) }); \ + CARE_NEST_END(CHECK) }} + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end an OpenMP RAJA loop that captures some @@ -476,7 +639,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_OPENMP_LOOP_WITH_REF_START(INDEX, START_INDEX, END_INDEX, CHECK, ...) 
{ \ if (END_INDEX > START_INDEX) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::openmp{}, __FILE__, __LINE__, START_INDEX, END_INDEX, [= FOR_EACH(CARE_REF_CAPTURE, __VA_ARGS__)] (const int INDEX) { + care::forall(care::openmp{}, __FILE__, __LINE__, START_INDEX, END_INDEX, 0, [= FOR_EACH(CARE_REF_CAPTURE, __VA_ARGS__)] (const int INDEX) { #define CARE_CHECKED_OPENMP_LOOP_WITH_REF_END(CHECK) }); \ CARE_NEST_END(CHECK) }} @@ -491,7 +654,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o //////////////////////////////////////////////////////////////////////////////// #define CARE_CHECKED_OPENMP_KERNEL_START(CHECK) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::openmp{}, __FILE__, __LINE__, 0, 1, [=] (const int) { + care::forall(care::openmp{}, __FILE__, __LINE__, 0, 1, 0, [=] (const int) { #define CARE_CHECKED_OPENMP_KERNEL_END(CHECK) }); \ CARE_NEST_END(CHECK) } @@ -510,11 +673,31 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_GPU_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) { \ if (END_INDEX > START_INDEX) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::gpu{}, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] CARE_DEVICE (const int INDEX) { + care::forall(care::gpu{}, __FILE__, __LINE__, START_INDEX, END_INDEX, 0, [=] CARE_DEVICE (const int INDEX) { #define CARE_CHECKED_GPU_LOOP_END(CHECK) }); \ CARE_NEST_END(CHECK) }} +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked GPU RAJA loop. If GPU is not available, +/// executes sequentially on the host. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_GPU_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) { \ + if (END_INDEX > START_INDEX) { \ + CARE_NEST_BEGIN(CHECK) \ + care::forall(care::gpu{}, __FILE__, __LINE__, START_INDEX, END_INDEX, CHUNK_SIZE, [=] CARE_DEVICE (const int INDEX) { + +#define CARE_CHECKED_CHUNKED_GPU_LOOP_END(CHECK) }); \ + CARE_NEST_END(CHECK) }} + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is @@ -525,7 +708,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o //////////////////////////////////////////////////////////////////////////////// #define CARE_CHECKED_GPU_KERNEL_START(CHECK) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::gpu{}, __FILE__, __LINE__, 0, 1, [=] CARE_DEVICE (const int) { + care::forall(care::gpu{}, __FILE__, __LINE__, 0, 1, 0, [=] CARE_DEVICE (const int) { #define CARE_CHECKED_GPU_KERNEL_END(CHECK) }); \ CARE_NEST_END(CHECK) } @@ -544,18 +727,12 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o /// //////////////////////////////////////////////////////////////////////////////// -#ifdef CARE_ENABLE_RACE_DETECTION -#define CARE_SET_THREAD_ID(INDEX) care::DebugPlugin::s_threadID = INDEX ; -#else -#define CARE_SET_THREAD_ID(INDEX) -#endif - #define CARE_CHECKED_POLICY_LOOP_START(POLICY, INDEX, START_INDEX, END_INDEX, CHECK) { \ auto _care_checked_loop_end = END_INDEX; \ decltype(_care_checked_loop_end) _care_checked_loop_begin = START_INDEX; \ if (_care_checked_loop_end > 
_care_checked_loop_begin) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(POLICY{}, __FILE__, __LINE__, _care_checked_loop_begin, _care_checked_loop_end, [=] CARE_DEVICE (decltype(_care_checked_loop_end) INDEX) { \ + care::forall(POLICY{}, __FILE__, __LINE__, _care_checked_loop_begin, _care_checked_loop_end, 0, [=] CARE_DEVICE (decltype(_care_checked_loop_end) INDEX) { \ CARE_SET_THREAD_ID(INDEX) #define CARE_CHECKED_POLICY_LOOP_END(CHECK) }); \ @@ -571,6 +748,43 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_POLICY_LOOP_END(CHECK) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked parallel RAJA loop. If GPU is available, +/// executes on the device. If GPU is not available but OpenMP is, +/// executes in parallel on the host. Otherwise, executes sequentially +/// on the host. +/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// + +#define CARE_CHECKED_CHUNKED_POLICY_LOOP_START(POLICY, INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) { \ + auto _care_checked_loop_end = END_INDEX; \ + decltype(_care_checked_loop_end) _care_checked_loop_begin = START_INDEX; \ + if (_care_checked_loop_end > _care_checked_loop_begin) { \ + CARE_NEST_BEGIN(CHECK) \ + care::forall(POLICY{}, __FILE__, __LINE__, _care_checked_loop_begin, _care_checked_loop_end, CHUNK_SIZE, [=] CARE_DEVICE (decltype(_care_checked_loop_end) INDEX) { \ + CARE_SET_THREAD_ID(INDEX) + +#define CARE_CHECKED_CHUNKED_POLICY_LOOP_END(CHECK) }); \ + CARE_NEST_END(CHECK) }} + +#define CARE_CHECKED_CHUNKED_PARALLEL_LOOP_START(INDEX, 
START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) \ + CARE_CHECKED_CHUNKED_POLICY_LOOP_START(care::parallel,INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) + +#define CARE_CHECKED_CHUNKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_POLICY_LOOP_END(CHECK) + +#define CARE_CHECKED_CHUNKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) \ + CARE_CHECKED_CHUNKED_POLICY_LOOP_START(care::parallel_reduce,INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) + +#define CARE_CHECKED_CHUNKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_POLICY_LOOP_END(CHECK) + + //////////////////////////////////////////////////////////////////////////////// /// @@ -582,7 +796,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o //////////////////////////////////////////////////////////////////////////////// #define CARE_CHECKED_PARALLEL_KERNEL_START(CHECK) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::parallel{}, __FILE__, __LINE__, 0, 1, [=] CARE_DEVICE (const int) { + care::forall(care::parallel{}, __FILE__, __LINE__, 0, 1, 0, [=] CARE_DEVICE (const int) { #define CARE_CHECKED_PARALLEL_KERNEL_END(CHECK) }); \ CARE_NEST_END(CHECK) } @@ -605,11 +819,35 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_MANAGED_PTR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) { \ if (END_INDEX > START_INDEX) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::managed_ptr_read{}, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] CARE_MANAGED_PTR_DEVICE (const int INDEX) { + care::forall(care::managed_ptr_read{}, __FILE__, __LINE__, START_INDEX, END_INDEX, 0, [=] CARE_MANAGED_PTR_DEVICE (const int INDEX) { #define CARE_CHECKED_MANAGED_PTR_LOOP_END(CHECK) }); \ CARE_NEST_END(CHECK) }} +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked RAJA loop that uses at least one +/// managed_ptr. 
If GPU is available, and managed_ptr is available +/// on the device, executes on the device. If GPU is not available +/// but OpenMP is, executes in parallel on the host. Otherwise, +/// executes sequentially on the host. The legacy version uses raw +/// OpenMP. +/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// @arg[in] CHECK The variable to check that the start and end macros match +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHECKED_CHUNKED_MANAGED_PTR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) { \ + if (END_INDEX > START_INDEX) { \ + CARE_NEST_BEGIN(CHECK) \ + care::forall(care::managed_ptr_read{}, __FILE__, __LINE__, START_INDEX, END_INDEX, CHUNK_SIZE, [=] CARE_MANAGED_PTR_DEVICE (const int INDEX) { + +#define CARE_CHECKED_CHUNKED_MANAGED_PTR_LOOP_END(CHECK) }); \ + CARE_NEST_END(CHECK) }} + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros for updating/initializing managed_ptrs. 
@@ -622,7 +860,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o //////////////////////////////////////////////////////////////////////////////// #define CARE_CHECKED_MANAGED_PTR_UPDATE_KERNEL_START(CHECK) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::managed_ptr_write{}, __FILE__, __LINE__, 0, 1, [=] CARE_MANAGED_PTR_HOST_DEVICE (const int) { + care::forall(care::managed_ptr_write{}, __FILE__, __LINE__, 0, 1, 0, [=] CARE_MANAGED_PTR_HOST_DEVICE (const int) { #define CARE_CHECKED_MANAGED_PTR_UPDATE_KERNEL_END(CHECK) }); \ CARE_NEST_END(CHECK) } @@ -652,6 +890,21 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_LOOP_END CARE_CHECKED_LOOP_END(care_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a call to chunked forall. +/// +/// @arg[in] POLICY The execution policy to use +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_LOOP(POLICY, INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_LOOP_START(POLICY, INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_loop_chunked_check) + +#define CARE_CHUNKED_LOOP_END CARE_CHECKED_CHUNKED_LOOP_END(care_loop_chunked_check) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a sequential RAJA loop. @@ -731,6 +984,21 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_OPENMP_LOOP_END CARE_CHECKED_OPENMP_LOOP_END(care_openmp_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked OpenMP RAJA loop. 
If OpenMP is not +/// available, executes sequentially on the host. +/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_OPENMP_LOOP(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_OPENMP_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_openmp_loop_chunked_check) + +#define CARE_CHUNKED_OPENMP_LOOP_END CARE_CHECKED_CHUNKED_OPENMP_LOOP_END(care_openmp_loop_chunked_check) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end an OpenMP RAJA loop that captures some @@ -761,6 +1029,22 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_GPU_LOOP_END CARE_CHECKED_GPU_LOOP_END(care_gpu_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked RAJA loop. If GPU is available, +/// executes on the device. If GPU is not available, executes +/// sequentially on the host. +/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_GPU_LOOP(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_GPU_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_gpu_loop_chunked_check) + +#define CARE_CHUNKED_GPU_LOOP_END CARE_CHECKED_CHUNKED_GPU_LOOP_END(care_gpu_loop_chunked_check) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a parallel RAJA loop. 
If GPU is available, @@ -777,6 +1061,24 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_PARALLEL_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_parallel_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked parallel RAJA loop. If GPU is available, +/// executes on the device. If GPU is not available but OpenMP is, +/// executes in parallel on the host. Otherwise, executes sequentially +/// on the host. +/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_PARALLEL_LOOP(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_parallel_loop_chunked_check) + +#define CARE_CHUNKED_PARALLEL_LOOP_END CARE_CHECKED_CHUNKED_PARALLEL_LOOP_END(care_parallel_loop_chunked_check) + + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a RAJA loop that uses at least one @@ -795,6 +1097,25 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_MANAGED_PTR_LOOP_END CARE_CHECKED_MANAGED_PTR_LOOP_END(care_managed_ptr_read_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked RAJA loop that uses at least one +/// managed_ptr. If GPU is available, and managed_ptr is available +/// on the device, executes on the device. If GPU is not available +/// but OpenMP is, executes in parallel on the host. Otherwise, +/// executes sequentially on the host. The legacy version uses raw +/// OpenMP. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_MANAGED_PTR_LOOP(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_MANAGED_PTR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_managed_ptr_read_loop_chunked_check) + +#define CARE_CHUNKED_MANAGED_PTR_LOOP_END CARE_CHECKED_CHUNKED_MANAGED_PTR_LOOP_END(care_managed_ptr_read_loop_chunked_check) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a parallel RAJA loop. If GPU is available, @@ -813,6 +1134,25 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_WORK_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_work_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked parallel RAJA loop. If GPU is available, +/// executes on the device. If GPU is not available but OpenMP is, +/// executes in parallel on the host. Otherwise, executes sequentially +/// on the host. +/// +/// WORK is an alias to PARALLEL that indicates a lot of work is taking place. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_WORK_LOOP(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_work_loop_chunked_check) + +#define CARE_CHUNKED_WORK_LOOP_END CARE_CHECKED_CHUNKED_PARALLEL_LOOP_END(care_work_loop_chunked_check) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a parallel RAJA loop. If GPU is available, @@ -831,6 +1171,24 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_STREAM_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_stream_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked parallel RAJA loop. If GPU is available, +/// executes on the device. If GPU is not available but OpenMP is, +/// executes in parallel on the host. Otherwise, executes sequentially +/// on the host. +/// +/// STREAM is an alias to PARALLEL that indicates not much work is taking place. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_STREAM_LOOP(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_stream_loop_chunked_check) + +#define CARE_CHUNKED_STREAM_LOOP_END CARE_CHECKED_CHUNKED_PARALLEL_LOOP_END(care_stream_loop_chunked_check) //////////////////////////////////////////////////////////////////////////////// /// @@ -850,6 +1208,25 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_REDUCE_LOOP_END CARE_CHECKED_REDUCE_LOOP_END(care_reduce_loop_check) +//////////////////////////////////////////////////////////////////////////////// +/// +/// @brief Macros that start and end a chunked parallel RAJA loop. If GPU is available, +/// executes on the device. If GPU is not available but OpenMP is, +/// executes in parallel on the host. Otherwise, executes sequentially +/// on the host. +/// +/// REDUCE is an alias to PARALLEL that indicates a reduction is taking place. 
+/// +/// @arg[in] INDEX The index variable +/// @arg[in] START_INDEX The starting index (inclusive) +/// @arg[in] END_INDEX The ending index (exclusive) +/// @arg[in] CHUNK_SIZE Maximum kernel size +/// +//////////////////////////////////////////////////////////////////////////////// +#define CARE_CHUNKED_REDUCE_LOOP(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE) CARE_CHECKED_CHUNKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, care_reduce_loop_chunked_check) + +#define CARE_CHUNKED_REDUCE_LOOP_END CARE_CHECKED_CHUNKED_REDUCE_LOOP_END(care_reduce_loop_chunked_check) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a sequential RAJA loop of length one. diff --git a/src/care/forall.h b/src/care/forall.h index 22813b1e..b7b79502 100644 --- a/src/care/forall.h +++ b/src/care/forall.h @@ -71,33 +71,51 @@ namespace care { /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// //////////////////////////////////////////////////////////////////////////////// template void forall(ExecutionPolicy /* policy */, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { + const int start, const int end, const int batch_size, LB&& body) { const int length = end - start; if (length != 0) { PluginData::setFileName(fileName); PluginData::setLineNumber(lineNumber); + int index = start ; + int chunk_size = batch_size > 0 ? batch_size : length ; + + while (index < end) { + int chunk_start = index ; + int chunk_end = (index + chunk_size < end) ? index + chunk_size : end ; #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS - RAJA::RangeStrideSegment rangeSegment = - s_reverseLoopOrder ? 
- RAJA::RangeStrideSegment(end - 1, start - 1, -1) : - RAJA::RangeStrideSegment(start, end, 1); + RAJA::RangeStrideSegment rangeSegment = + s_reverseLoopOrder ? + RAJA::RangeStrideSegment(chunk_end - 1, chunk_start - 1, -1) : + RAJA::RangeStrideSegment(chunk_start, chunk_end, 1); #else - RAJA::RangeSegment rangeSegment = RAJA::RangeSegment(start, end); + RAJA::RangeSegment rangeSegment = RAJA::RangeSegment(chunk_start, chunk_end); #endif #if CARE_ENABLE_GPU_SIMULATION_MODE - RAJA::forall(rangeSegment, std::forward(body)); + chai::ArrayManager* threadRM = chai::ArrayManager::getInstance(); + if (ExecutionPolicyToSpace::value == chai::GPU) { + threadRM->setGPUSimMode(true); + } + else { + threadRM->setGPUSimMode(false); + } + RAJA::forall(rangeSegment, std::forward(body)); + threadRM->setGPUSimMode(false); #else - RAJA::forall(rangeSegment, std::forward(body)); + RAJA::forall(rangeSegment, std::forward(body)); #endif + + index += chunk_size ; + } } } @@ -112,13 +130,14 @@ namespace care { /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// //////////////////////////////////////////////////////////////////////////////// template void forall(sequential, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { - forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, std::forward(body)); + const int start, const int end, const int batch_size, LB&& body) { + forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); } //////////////////////////////////////////////////////////////////////////////// @@ -133,20 +152,21 @@ namespace care { /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The 
starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// //////////////////////////////////////////////////////////////////////////////// template void forall(openmp, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { + const int start, const int end, const int batch_size, LB&& body) { #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS s_reverseLoopOrder = true; #endif #if defined(_OPENMP) && defined(RAJA_ENABLE_OPENMP) - forall(RAJA::omp_parallel_for_exec{}, fileName, lineNumber, start, end, std::forward(body)); + forall(RAJA::omp_parallel_for_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #else - forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, std::forward(body)); + forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #endif #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS @@ -166,26 +186,27 @@ namespace care { /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// //////////////////////////////////////////////////////////////////////////////// template void forall(gpu, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { + const int start, const int end, const int batch_size, LB&& body) { #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS s_reverseLoopOrder = true; #endif #if CARE_ENABLE_GPU_SIMULATION_MODE - forall(gpu_simulation{}, fileName, lineNumber, start, end, std::forward(body)); + forall(gpu_simulation{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(__CUDACC__) forall(RAJA::cuda_exec{}, - 
fileName, lineNumber, start, end, std::forward(body)); + fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(__HIPCC__) forall(RAJA::hip_exec{}, - fileName, lineNumber, start, end, std::forward(body)); + fileName, lineNumber, start, end, batch_size, std::forward(body)); #else - forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, std::forward(body)); + forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #endif #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS @@ -206,29 +227,30 @@ namespace care { /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// //////////////////////////////////////////////////////////////////////////////// template void forall(parallel, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { + const int start, const int end, const int batch_size, LB&& body) { #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS s_reverseLoopOrder = true; #endif PluginData::setParallelContext(true); #if CARE_ENABLE_GPU_SIMULATION_MODE - forall(gpu_simulation{}, fileName, lineNumber, start, end, std::forward(body)); + forall(gpu_simulation{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(__CUDACC__) forall(RAJA::cuda_exec{}, - fileName, lineNumber, start, end, std::forward(body)); + fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(__HIPCC__) forall(RAJA::hip_exec{}, - fileName, lineNumber, start, end, std::forward(body)); + fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(_OPENMP) && defined(RAJA_ENABLE_OPENMP) - forall(RAJA::omp_parallel_for_exec{}, fileName, lineNumber, start, end, std::forward(body)); + 
forall(RAJA::omp_parallel_for_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #else - forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, std::forward(body)); + forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #endif PluginData::setParallelContext(false); @@ -243,23 +265,24 @@ namespace care { /// /// @brief Execute using the care::RAJAReductionExec policy /// - /// @arg[in] parallel_reducew Used to choose this overload of forall + /// @arg[in] parallel_reduce Used to choose this overload of forall /// @arg[in] fileName The name of the file where this function is called /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// //////////////////////////////////////////////////////////////////////////////// template void forall(parallel_reduce, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { + const int start, const int end, const int batch_size, LB&& body) { #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS s_reverseLoopOrder = true; #endif PluginData::setParallelContext(true); - forall(RAJAReductionExec{}, fileName, lineNumber, start, end, std::forward(body)); + forall(RAJAReductionExec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); PluginData::setParallelContext(false); @@ -282,24 +305,25 @@ namespace care { /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// 
//////////////////////////////////////////////////////////////////////////////// template void forall(managed_ptr_read, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { + const int start, const int end, const int batch_size, LB&& body) { #if CARE_ENABLE_GPU_SIMULATION_MODE && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) - forall(gpu_simulation{}, fileName, lineNumber, start, end, std::forward(body)); + forall(gpu_simulation{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(__CUDACC__) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) forall(RAJA::cuda_exec{}, - fileName, lineNumber, start, end, std::forward(body)); + fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(__HIPCC__) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) forall(RAJA::hip_exec{}, - fileName, lineNumber, start, end, std::forward(body)); + fileName, lineNumber, start, end, batch_size, std::forward(body)); #elif defined(_OPENMP) && defined(RAJA_ENABLE_OPENMP) - forall(RAJA::omp_parallel_for_exec{}, fileName, lineNumber, start, end, std::forward(body)); + forall(RAJA::omp_parallel_for_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #else - forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, std::forward(body)); + forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); #endif } @@ -433,9 +457,9 @@ namespace care { //////////////////////////////////////////////////////////////////////////////// template void forall(managed_ptr_write, const char * fileName, int lineNumber, - int start, const int end, LB body) { + const int start, const int end, const int batch_size, LB&& body) { // preLoopPrint and postLoopPrint are handled in this call. 
- forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, body); + forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, batch_size, body); #if defined(CARE_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) const int length = end - start; @@ -444,12 +468,23 @@ namespace care { chai::ArrayManager* threadRM = chai::ArrayManager::getInstance(); threadRM->setExecutionSpace(chai::GPU); + int index = start ; + int chunk_size = batch_size > 0 ? batch_size : length ; + + while (index < end) { + int chunk_start = index ; + int chunk_end = (index + chunk_size < end) ? index + chunk_size : end ; + RAJA::RangeSegment rangeSegment = RAJA::RangeSegment(chunk_start, chunk_end); + #if defined(__CUDACC__) - RAJA::forall< RAJA::cuda_exec>(RAJA::RangeSegment(start, end), body); + RAJA::forall< RAJA::cuda_exec>(rangeSegment, body); #elif defined(__HIPCC__) - RAJA::forall< RAJA::hip_exec>(RAJA::RangeSegment(start, end), body); + RAJA::forall< RAJA::hip_exec>(rangeSegment, body); #endif + index += chunk_size ; + } + #if FORCE_SYNC && defined(CARE_GPUCC) care::gpuDeviceSynchronize(fileName, lineNumber); #endif @@ -471,30 +506,31 @@ namespace care { /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The starting index (inclusive) /// @arg[in] end The ending index (exclusive) + /// @arg[in] batch_size Maximum length of each kernel (0 for no limit) /// @arg[in] body The loop body to execute at each index /// //////////////////////////////////////////////////////////////////////////////// template void forall(Policy&& policy, const char * fileName, const int lineNumber, - const int start, const int end, LB&& body) { + const int start, const int end, const int batch_size, LB&& body) { switch (policy) { case Policy::sequential: - forall(sequential{}, fileName, lineNumber, start, end, std::forward(body)); + forall(sequential{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); break; case Policy::openmp: - 
forall(openmp{}, fileName, lineNumber, start, end, std::forward(body)); + forall(openmp{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); break; case Policy::gpu: - forall(gpu{}, fileName, lineNumber, start, end, std::forward(body)); + forall(gpu{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); break; case Policy::parallel: - forall(parallel{}, fileName, lineNumber, start, end, std::forward(body)); + forall(parallel{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); break; case Policy::managed_ptr_read: - forall(managed_ptr_read{}, fileName, lineNumber, start, end, std::forward(body)); + forall(managed_ptr_read{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); break; case Policy::managed_ptr_write: - forall(managed_ptr_write{}, fileName, lineNumber, start, end, std::forward(body)); + forall(managed_ptr_write{}, fileName, lineNumber, start, end, batch_size, std::forward(body)); break; default: std::cout << "[CARE] Error: Invalid policy!" 
<< std::endl; diff --git a/test/TestForall.cpp b/test/TestForall.cpp index 2b10da77..8cadc3bc 100644 --- a/test/TestForall.cpp +++ b/test/TestForall.cpp @@ -56,6 +56,40 @@ CPU_TEST(forall, dynamic_policy) temp.free(); } +CPU_TEST(forall, chunked_static_policy) +{ + const int batch_size = 3; + const int length = 10; + care::host_device_ptr temp(length, "temp"); + + CARE_CHUNKED_LOOP(care::sequential{}, i, 0, length, batch_size) { + temp[i] = i; + } CARE_CHUNKED_LOOP_END + + CARE_SEQUENTIAL_LOOP(i, 0, length) { + EXPECT_EQ(temp[i], i); + } CARE_SEQUENTIAL_LOOP_END + + temp.free(); +} + +CPU_TEST(forall, chunked_dynamic_policy) +{ + const int batch_size = 3; + const int length = 10; + care::host_device_ptr temp(length, "temp"); + + CARE_CHUNKED_LOOP(care::Policy::sequential, i, 0, length, batch_size) { + temp[i] = i; + } CARE_CHUNKED_LOOP_END + + CARE_SEQUENTIAL_LOOP(i, 0, length) { + EXPECT_EQ(temp[i], i); + } CARE_SEQUENTIAL_LOOP_END + + temp.free(); +} + #if defined(CARE_ENABLE_RACE_DETECTION) CPU_TEST(forall, race_condition_detection) { @@ -117,5 +151,40 @@ GPU_TEST(forall, dynamic_policy) temp.free(); } +GPU_TEST(forall, chunked_static_policy) +{ + const int batch_size = 3; + const int length = 10; + care::host_device_ptr temp(length, "temp"); + + CARE_CHUNKED_LOOP(care::gpu{}, i, 0, length, batch_size) { + temp[i] = i; + } CARE_CHUNKED_LOOP_END + + CARE_SEQUENTIAL_LOOP(i, 0, length) { + EXPECT_EQ(temp[i], i); + } CARE_SEQUENTIAL_LOOP_END + + temp.free(); +} + +GPU_TEST(forall, chunked_dynamic_policy) +{ + const int batch_size = 3; + const int length = 10; + care::host_device_ptr temp(length, "temp"); + + CARE_CHUNKED_LOOP(care::Policy::gpu, i, 0, length, batch_size) { + temp[i] = i; + } CARE_CHUNKED_LOOP_END + + CARE_SEQUENTIAL_LOOP(i, 0, length) { + EXPECT_EQ(temp[i], i); + } CARE_SEQUENTIAL_LOOP_END + + temp.free(); +} + + #endif // CARE_GPUCC