diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H
index e20f8bb4b79..2b5b4274766 100644
--- a/Src/Base/AMReX_FBI.H
+++ b/Src/Base/AMReX_FBI.H
@@ -53,7 +53,9 @@ void
 fab_to_fab (Vector<Array4CopyTag<T0, T1>> const& copy_tags, int scomp, int dcomp,
             int ncomp, F && f)
 {
-    detail::ParallelFor_doit(copy_tags,
+    TagVector<Array4CopyTag<T0, T1>> tv{copy_tags};
+
+    detail::ParallelFor_doit(tv,
     [=] AMREX_GPU_DEVICE (
 #ifdef AMREX_USE_SYCL
         sycl::nd_item<1> const& /*item*/,
@@ -85,7 +87,9 @@ fab_to_fab (Vector<Array4CopyTag<T0, T1>> const& copy_tags, int scomp, int dcom
     amrex::Abort("xxxxx TODO This function still has a bug. Even if we fix the bug, it should still be avoided because it is slow due to the lack of atomic operations for this type.");
 
-    detail::ParallelFor_doit(tags,
+    TagVector<TagType> tv{tags};
+
+    detail::ParallelFor_doit(tv,
     [=] AMREX_GPU_DEVICE (
 #ifdef AMREX_USE_SYCL
         sycl::nd_item<1> const& item,
diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H
index 5cb640d2f13..67e80061580 100644
--- a/Src/Base/AMReX_FabArray.H
+++ b/Src/Base/AMReX_FabArray.H
@@ -1623,7 +1623,10 @@ FabArray<FAB>::build_arrays () const
 #ifdef AMREX_USE_GPU
         m_arrays.dp = (A*)m_dp_arrays;
         m_const_arrays.dp = (AC*)m_dp_arrays + n;
-        Gpu::htod_memcpy(m_dp_arrays, m_hp_arrays, n*2*sizeof(A));
+        Gpu::htod_memcpy_async(m_dp_arrays, m_hp_arrays, n*2*sizeof(A));
+        if (!Gpu::inNoSyncRegion()) {
+            Gpu::streamSynchronize();
+        }
 #endif
     }
 }
@@ -1633,9 +1636,12 @@ void
 FabArray<FAB>::clear_arrays ()
 {
 #ifdef AMREX_USE_GPU
-    The_Pinned_Arena()->free(m_hp_arrays);
-    The_Arena()->free(m_dp_arrays);
-    m_dp_arrays = nullptr;
+    if (m_dp_arrays) {
+        Gpu::streamSynchronize();
+        The_Pinned_Arena()->free(m_hp_arrays);
+        The_Arena()->free(m_dp_arrays);
+        m_dp_arrays = nullptr;
+    }
 #else
     std::free(m_hp_arrays);
 #endif
diff --git a/Src/Base/AMReX_GpuControl.cpp b/Src/Base/AMReX_GpuControl.cpp
index 4c862b417c5..ddc5c6714f8 100644
--- a/Src/Base/AMReX_GpuControl.cpp
+++ b/Src/Base/AMReX_GpuControl.cpp
@@ -6,8 +6,8 @@ namespace amrex::Gpu {
 #if defined(AMREX_USE_GPU)
 bool in_launch_region = true;
 bool in_graph_region = false;
-bool in_single_stream_region = false;
-bool in_nosync_region = false;
+bool in_single_stream_region = true;
+bool in_nosync_region = true;
 #endif
 
 }
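With the two region flags now defaulting to true, GPU work enqueued by AMReX no longer forces an implicit stream synchronize; call sites that must observe completion do so explicitly, as build_arrays does above. A minimal sketch of that idiom, assuming only the Gpu calls shown in this patch (the wrapper function itself is hypothetical):

    // Enqueue a host-to-device copy and block only when the caller expects it.
    void upload (void* d_dst, void const* h_src, std::size_t nbytes)
    {
        amrex::Gpu::htod_memcpy_async(d_dst, h_src, nbytes);
        if (!amrex::Gpu::inNoSyncRegion()) {
            amrex::Gpu::streamSynchronize();
        }
    }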
diff --git a/Src/Base/AMReX_TagParallelFor.H b/Src/Base/AMReX_TagParallelFor.H
index 63bc127bba4..a4a4faadc52 100644
--- a/Src/Base/AMReX_TagParallelFor.H
+++ b/Src/Base/AMReX_TagParallelFor.H
@@ -101,31 +101,153 @@ struct VectorTag {
     Long size () const noexcept { return m_size; }
 };
 
-#ifdef AMREX_USE_GPU
-
 namespace detail {
 
-template <class T>
-std::enable_if_t<std::is_same<std::decay_t<decltype(std::declval<T>().box())>, Box>::value,
-                 Long>
-get_tag_size (T const& tag) noexcept
-{
-    AMREX_ASSERT(tag.box().numPts() < Long(std::numeric_limits<int>::max()));
-    return static_cast<Long>(tag.box().numPts());
-}
+    template <class T>
+    std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<T>().box())>, Box>, Long>
+    get_tag_size (T const& tag) noexcept
+    {
+        AMREX_ASSERT(tag.box().numPts() < Long(std::numeric_limits<int>::max()));
+        return static_cast<Long>(tag.box().numPts());
+    }
+
+    template <class T>
+    std::enable_if_t<std::is_integral_v<std::decay_t<decltype(std::declval<T>().size())> >, Long>
+    get_tag_size (T const& tag) noexcept
+    {
+        AMREX_ASSERT(tag.size() < Long(std::numeric_limits<int>::max()));
+        return tag.size();
+    }
+
+    template <class T>
+    constexpr
+    std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<T>().box())>, Box>, bool>
+    is_box_tag (T const&) { return true; }
+
+    template <class T>
+    constexpr
+    std::enable_if_t<std::is_integral_v<std::decay_t<decltype(std::declval<T>().size())> >, bool>
+    is_box_tag (T const&) { return false; }
 
-template <class T>
-std::enable_if_t<std::is_integral<std::decay_t<decltype(std::declval<T>().size())> >::value,
-                 Long>
-get_tag_size (T const& tag) noexcept
-{
-    AMREX_ASSERT(tag.size() < Long(std::numeric_limits<int>::max()));
-    return tag.size();
-}
 }
 
+template <class TagType>
+struct TagVector {
+
+    char* h_buffer = nullptr;
+    char* d_buffer = nullptr;
+    TagType* d_tags = nullptr;
+    int* d_nwarps = nullptr;
+    int ntags = 0;
+    int ntotwarps = 0;
+    int nblocks = 0;
+    bool defined = false;
+    static constexpr int nthreads = 256;
+
+    TagVector () = default;
+
+    TagVector (Vector<TagType> const& tags) {
+        define(tags);
+    }
+
+    ~TagVector () {
+        if (defined) {
+            undefine();
+        }
+    }
+
+    TagVector (const TagVector& other) = delete;
+    TagVector& operator= (const TagVector& other) = delete;
+
+    // Moves must steal the buffers. A defaulted move would copy the raw
+    // pointers and leave `other` marked as defined, so both destructors
+    // would free the same buffers.
+    TagVector (TagVector&& other) noexcept {
+        *this = std::move(other);
+    }
+    TagVector& operator= (TagVector&& other) noexcept {
+        if (this != &other) {
+            if (defined) { undefine(); }
+            h_buffer = std::exchange(other.h_buffer, nullptr);
+            d_buffer = std::exchange(other.d_buffer, nullptr);
+            d_tags = std::exchange(other.d_tags, nullptr);
+            d_nwarps = std::exchange(other.d_nwarps, nullptr);
+            ntags = std::exchange(other.ntags, 0);
+            ntotwarps = std::exchange(other.ntotwarps, 0);
+            nblocks = std::exchange(other.nblocks, 0);
+            defined = std::exchange(other.defined, false);
+        }
+        return *this;
+    }
+
+    [[nodiscard]] bool is_defined () const { return defined; }
+
+    void define (Vector<TagType> const& tags) {
+        if (defined) {
+            undefine();
+        }
+
+        ntags = tags.size();
+        if (ntags == 0) {
+            defined = true;
+            return;
+        }
+
+#ifdef AMREX_USE_GPU
+        Long l_ntotwarps = 0;
+        ntotwarps = 0;
+        Vector<int> nwarps;
+        nwarps.reserve(ntags+1);
+        for (int i = 0; i < ntags; ++i)
+        {
+            auto& tag = tags[i];
+            nwarps.push_back(ntotwarps);
+            auto nw = (detail::get_tag_size(tag) + Gpu::Device::warp_size-1) /
+                Gpu::Device::warp_size;
+            l_ntotwarps += nw;
+            ntotwarps += static_cast<int>(nw);
+        }
+        nwarps.push_back(ntotwarps);
+
+        std::size_t sizeof_tags = ntags*sizeof(TagType);
+        std::size_t offset_nwarps = Arena::align(sizeof_tags);
+        std::size_t sizeof_nwarps = (ntags+1)*sizeof(int);
+        std::size_t total_buf_size = offset_nwarps + sizeof_nwarps;
+
+        h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size);
+        d_buffer = (char*)The_Arena()->alloc(total_buf_size);
+
+        std::memcpy(h_buffer, tags.data(), sizeof_tags);
+        std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps);
+        Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size);
+
+        d_tags = reinterpret_cast<TagType*>(d_buffer);
+        d_nwarps = reinterpret_cast<int*>(d_buffer+offset_nwarps);
+
+        constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size;
+        nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block;
+
+        defined = true;
+
+        amrex::ignore_unused(l_ntotwarps);
+        AMREX_ALWAYS_ASSERT(l_ntotwarps+nwarps_per_block-1 < Long(std::numeric_limits<int>::max()));
+#else
+        std::size_t sizeof_tags = ntags*sizeof(TagType);
+        h_buffer = (char*)The_Pinned_Arena()->alloc(sizeof_tags);
+
+        std::memcpy(h_buffer, tags.data(), sizeof_tags);
+
+        d_tags = reinterpret_cast<TagType*>(h_buffer);
+
+        defined = true;
+#endif
+    }
+
+    void undefine () {
+        if (defined) {
+            Gpu::streamSynchronize();
+            The_Pinned_Arena()->free(h_buffer);
+            The_Arena()->free(d_buffer);
+            h_buffer = nullptr;
+            d_buffer = nullptr;
+            d_tags = nullptr;
+            d_nwarps = nullptr;
+            ntags = 0;
+            ntotwarps = 0;
+            nblocks = 0;
+            defined = false;
+        }
+    }
+};
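TagVector hoists the host-side packing and the host-to-device upload of a tag list out of the kernel launch, so one upload can serve many launches. A minimal usage sketch, assuming a hypothetical box-style tag type MyTag (anything with a box() method) and a hypothetical build_tags() helper:

    amrex::Vector<MyTag> tags = build_tags();  // host-side setup, done once
    amrex::TagVector<MyTag> tv{tags};          // pack into pinned memory + async upload
    for (int pass = 0; pass < npasses; ++pass) {
        amrex::ParallelFor(tv,
        [=] AMREX_GPU_DEVICE (int i, int j, int k, MyTag const& tag) noexcept
        {
            // ... work on cell (i,j,k) of tag ...
        });
    }
    // tv.undefine() (or the destructor) synchronizes the stream before freeing.

The Vector<TagType> overloads retained at the end of this file keep the old one-shot interface; they build a temporary TagVector and forward to the TagVector overloads.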
+
+namespace detail {
+
+#ifdef AMREX_USE_GPU
+
 template <class TagType, class F>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-std::enable_if_t<std::is_same<std::decay_t<decltype(std::declval<TagType>().box())>, Box>::value, void>
+std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<TagType>().box())>, Box>, void>
 tagparfor_call_f (
 #ifdef AMREX_USE_SYCL
     sycl::nd_item<1> const& item,
@@ -150,7 +272,7 @@ tagparfor_call_f (
 template <class TagType, class F>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-std::enable_if_t<std::is_integral<std::decay_t<decltype(std::declval<TagType>().size())> >::value, void>
+std::enable_if_t<std::is_integral_v<std::decay_t<decltype(std::declval<TagType>().size())> >, void>
 tagparfor_call_f (
 #ifdef AMREX_USE_SYCL
     sycl::nd_item<1> const& item,
@@ -167,48 +289,19 @@ tagparfor_call_f (
 
 template <class TagType, class F>
 void
-ParallelFor_doit (Vector<TagType> const& tags, F && f)
+ParallelFor_doit (TagVector<TagType> const& tv, F const& f)
 {
-    const int ntags = tags.size();
-    if (ntags == 0) { return; }
-
-    Long l_ntotwarps = 0;
-    int ntotwarps = 0;
-    Vector<int> nwarps;
-    nwarps.reserve(ntags+1);
-    for (int i = 0; i < ntags; ++i)
-    {
-        auto& tag = tags[i];
-        nwarps.push_back(ntotwarps);
-        auto nw = (get_tag_size(tag) + Gpu::Device::warp_size-1) / Gpu::Device::warp_size;
-        l_ntotwarps += nw;
-        ntotwarps += static_cast<int>(nw);
-    }
-    nwarps.push_back(ntotwarps);
+    AMREX_ALWAYS_ASSERT(tv.is_defined());
 
-    std::size_t sizeof_tags = ntags*sizeof(TagType);
-    std::size_t offset_nwarps = Arena::align(sizeof_tags);
-    std::size_t sizeof_nwarps = (ntags+1)*sizeof(int);
-    std::size_t total_buf_size = offset_nwarps + sizeof_nwarps;
+    if (tv.ntags == 0) { return; }
 
-    char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size);
-    char* d_buffer = (char*)The_Arena()->alloc(total_buf_size);
+    const auto d_tags = tv.d_tags;
+    const auto d_nwarps = tv.d_nwarps;
+    const auto ntags = tv.ntags;
+    const auto ntotwarps = tv.ntotwarps;
+    constexpr auto nthreads = TagVector<TagType>::nthreads;
 
-    std::memcpy(h_buffer, tags.data(), sizeof_tags);
-    std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps);
-    Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size);
-
-    auto d_tags = reinterpret_cast<TagType*>(d_buffer);
-    auto d_nwarps = reinterpret_cast<int*>(d_buffer+offset_nwarps);
-
-    constexpr int nthreads = 256;
-    constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size;
-    int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block;
-
-    amrex::ignore_unused(l_ntotwarps);
-    AMREX_ASSERT(l_ntotwarps+nwarps_per_block-1 < Long(std::numeric_limits<int>::max()));
-
-    amrex::launch(nblocks, nthreads, Gpu::gpuStream(),
+    amrex::launch(tv.nblocks, nthreads, Gpu::gpuStream(),
 #ifdef AMREX_USE_SYCL
     [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
     [[sycl::reqd_work_group_size(nthreads)]]
@@ -241,20 +334,60 @@ ParallelFor_doit (Vector<TagType> const& tags, F && f)
         tagparfor_call_f( icell, d_tags[tag_id], f);
 #endif
     });
+}
+
+#else // ifdef AMREX_USE_GPU
 
-    Gpu::streamSynchronize();
-    The_Pinned_Arena()->free(h_buffer);
-    The_Arena()->free(d_buffer);
+template <class TagType, class F>
+void
+ParallelFor_doit (TagVector<TagType> const& tv, F const& f)
+{
+    AMREX_ALWAYS_ASSERT(tv.is_defined());
+
+    if (tv.ntags == 0) { return; }
+
+    const auto d_tags = tv.d_tags;
+    const auto ntags = tv.ntags;
+
+#ifdef AMREX_USE_OMP
+#pragma omp parallel for
+#endif
+    for (int itag = 0; itag < ntags; ++itag) {
+
+        const auto& t = d_tags[itag];
+
+        // TagType{} is used only for overload selection; a runtime reference
+        // would not be usable in the if-constexpr constant expression.
+        if constexpr (is_box_tag(TagType{})) {
+            const auto lo = amrex::lbound(t.box());
+            const auto hi = amrex::ubound(t.box());
+
+            for (int k = lo.z; k <= hi.z; ++k) {
+                for (int j = lo.y; j <= hi.y; ++j) {
+                    AMREX_PRAGMA_SIMD
+                    for (int i = lo.x; i <= hi.x; ++i) {
+                        f(0, 1, i, j, k, t);
+                    }
+                }
+            }
+        } else {
+            const auto size = t.size();
+
+            AMREX_PRAGMA_SIMD
+            for (int i = 0; i < size; ++i) {
+                f(i, size, t);
+            }
+        }
+    }
 }
+
+#endif
+
 }
 
 template <class TagType, class F>
-std::enable_if_t<std::is_same<std::decay_t<decltype(std::declval<TagType>().box())>,
-                              Box>::value>
-ParallelFor (Vector<TagType> const& tags, int ncomp, F && f)
+std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<TagType>().box())>, Box>>
+ParallelFor (TagVector<TagType> const& tv, int ncomp, F const& f)
 {
-    detail::ParallelFor_doit(tags,
+    detail::ParallelFor_doit(tv,
     [=] AMREX_GPU_DEVICE (
 #ifdef AMREX_USE_SYCL
         sycl::nd_item<1> const& /*item*/,
@@ -270,10 +403,10 @@ ParallelFor (Vector<TagType> const& tags, int ncomp, F && f)
 }
 
 template <class TagType, class F>
-std::enable_if_t<std::is_same<std::decay_t<decltype(std::declval<TagType>().box())>, Box>::value, void>
-ParallelFor (Vector<TagType> const& tags, F && f)
+std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<TagType>().box())>, Box>, void>
+ParallelFor (TagVector<TagType> const& tv, F const& f)
 {
-    detail::ParallelFor_doit(tags,
+    detail::ParallelFor_doit(tv,
     [=] AMREX_GPU_DEVICE (
 #ifdef AMREX_USE_SYCL
         sycl::nd_item<1> const& /*item*/,
@@ -287,10 +420,10 @@ ParallelFor (Vector<TagType> const& tags, F && f)
 }
 
 template <class TagType, class F>
-std::enable_if_t<std::is_integral<std::decay_t<decltype(std::declval<TagType>().size())> >::value, void>
-ParallelFor (Vector<TagType> const& tags, F && f)
+std::enable_if_t<std::is_integral_v<std::decay_t<decltype(std::declval<TagType>().size())> >, void>
+ParallelFor (TagVector<TagType> const& tv, F const& f)
 {
-    detail::ParallelFor_doit(tags,
+    detail::ParallelFor_doit(tv,
     [=] AMREX_GPU_DEVICE (
 #ifdef AMREX_USE_SYCL
         sycl::nd_item<1> const& /*item*/,
@@ -303,7 +436,29 @@ ParallelFor (Vector<TagType> const& tags, F && f)
     });
 }
 
-#endif
+template <class TagType, class F>
+std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<TagType>().box())>, Box>>
+ParallelFor (Vector<TagType> const& tags, int ncomp, F && f)
+{
+    TagVector<TagType> tv{tags};
+    ParallelFor(tv, ncomp, std::forward<F>(f));
+}
+
+template <class TagType, class F>
+std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<TagType>().box())>, Box>, void>
+ParallelFor (Vector<TagType> const& tags, F && f)
+{
+    TagVector<TagType> tv{tags};
+    ParallelFor(tv, std::forward<F>(f));
+}
+
+template <class TagType, class F>
+std::enable_if_t<std::is_integral_v<std::decay_t<decltype(std::declval<TagType>().size())> >, void>
+ParallelFor (Vector<TagType> const& tags, F && f)
+{
+    TagVector<TagType> tv{tags};
+    ParallelFor(tv, std::forward<F>(f));
+}
 
 }
diff --git a/Src/Boundary/AMReX_BndryRegister.H b/Src/Boundary/AMReX_BndryRegister.H
index 7892218734f..12aa5dd2f93 100644
--- a/Src/Boundary/AMReX_BndryRegister.H
+++ b/Src/Boundary/AMReX_BndryRegister.H
@@ -267,9 +267,12 @@ BndryRegisterT<MF>::copyFrom (const MF& src, int nghost,
                               int src_comp, int dest_comp, int num_comp,
                               const Periodicity& period)
 {
-    for (OrientationIter face; face; ++face)
-    {
-        bndry[face()].copyFrom(src,nghost,src_comp,dest_comp,num_comp,period);
+    for (OrientationIter face; face; ++face) {
+        bndry[face()].multiFab().ParallelCopy_nowait(src,src_comp,dest_comp,num_comp,nghost,0,period);
+    }
+
+    for (OrientationIter face; face; ++face) {
+        bndry[face()].multiFab().ParallelCopy_finish();
     }
     return *this;
 }
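Splitting copyFrom into a nowait phase and a finish phase lets the six face copies overlap their communication instead of running back to back. The same split-phase idiom applies to any group of independent copies; a sketch with hypothetical MultiFabs (the member functions are the ones used above):

    dst_a.ParallelCopy_nowait(src_a, 0, 0, ncomp, ngsrc, ngdst, period);
    dst_b.ParallelCopy_nowait(src_b, 0, 0, ncomp, ngsrc, ngdst, period);
    // ... both exchanges are now in flight ...
    dst_a.ParallelCopy_finish();
    dst_b.ParallelCopy_finish();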
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H
index bb5172396cf..9bdbf9366ee 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H
@@ -263,6 +263,83 @@ void abec_gsrb (int i, int j, int k, int n, Array4<T> const& phi, Array4<T cons
 
+template <typename T>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void precompute_abec_gsrb (int i, int j, int k, int n,
+                           Array4<T> const& pre_comp,
+                           T alpha, Array4<T const> const& a,
+                           T dhx, T dhy, T dhz,
+                           Array4<T const> const& bX, Array4<T const> const& bY,
+                           Array4<T const> const& bZ,
+                           Array4<int const> const& m0, Array4<int const> const& m2,
+                           Array4<int const> const& m4,
+                           Array4<int const> const& m1, Array4<int const> const& m3,
+                           Array4<int const> const& m5,
+                           Array4<T const> const& f0, Array4<T const> const& f2,
+                           Array4<T const> const& f4,
+                           Array4<T const> const& f1, Array4<T const> const& f3,
+                           Array4<T const> const& f5,
+                           Box const& vbox) noexcept
+{
+    constexpr T omega = T(1.15);
+
+    const auto vlo = amrex::lbound(vbox);
+    const auto vhi = amrex::ubound(vbox);
+
+    T cf0 = (i == vlo.x && m0(vlo.x-1,j,k) > 0)
+        ? f0(vlo.x,j,k,n) : T(0.0);
+    T cf1 = (j == vlo.y && m1(i,vlo.y-1,k) > 0)
+        ? f1(i,vlo.y,k,n) : T(0.0);
+    T cf2 = (k == vlo.z && m2(i,j,vlo.z-1) > 0)
+        ? f2(i,j,vlo.z,n) : T(0.0);
+    T cf3 = (i == vhi.x && m3(vhi.x+1,j,k) > 0)
+        ? f3(vhi.x,j,k,n) : T(0.0);
+    T cf4 = (j == vhi.y && m4(i,vhi.y+1,k) > 0)
+        ? f4(i,vhi.y,k,n) : T(0.0);
+    T cf5 = (k == vhi.z && m5(i,j,vhi.z+1) > 0)
+        ? f5(i,j,vhi.z,n) : T(0.0);
+
+    T gamma = alpha*a(i,j,k)
+        +   dhx*(bX(i,j,k,n)+bX(i+1,j,k,n))
+        +   dhy*(bY(i,j,k,n)+bY(i,j+1,k,n))
+        +   dhz*(bZ(i,j,k,n)+bZ(i,j,k+1,n));
+
+    T g_m_d = gamma
+        - (dhx*(bX(i,j,k,n)*cf0 + bX(i+1,j,k,n)*cf3)
+        +  dhy*(bY(i,j,k,n)*cf1 + bY(i,j+1,k,n)*cf4)
+        +  dhz*(bZ(i,j,k,n)*cf2 + bZ(i,j,k+1,n)*cf5));
+
+    T omega_dif_g_m_d = omega / g_m_d;
+
+    pre_comp(i, j, k, 2*n) = gamma;
+    pre_comp(i, j, k, 2*n+1) = omega_dif_g_m_d;
+}
+
+template <typename T>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void abec_gsrb_v2 (int i, int j, int k, int n, Array4<T> const& phi, Array4<T const> const& rhs,
+                   Array4<T const> const& pre_comp,
+                   T dhx, T dhy, T dhz,
+                   Array4<T const> const& bX, Array4<T const> const& bY,
+                   Array4<T const> const& bZ, int redblack) noexcept
+{
+    if ((i+j+k+redblack)%2 == 0) {
+
+        T gamma = pre_comp(i,j,k,2*n);
+        T omega_dif_g_m_d = pre_comp(i,j,k,2*n+1);
+
+        T rho = dhx*( bX(i  ,j,k,n)*phi(i-1,j,k,n)
+                    + bX(i+1,j,k,n)*phi(i+1,j,k,n) )
+            +   dhy*( bY(i,j  ,k,n)*phi(i,j-1,k,n)
+                    + bY(i,j+1,k,n)*phi(i,j+1,k,n) )
+            +   dhz*( bZ(i,j,k  ,n)*phi(i,j,k-1,n)
+                    + bZ(i,j,k+1,n)*phi(i,j,k+1,n) );
+
+        T res = rhs(i,j,k,n) - (gamma*phi(i,j,k,n) - rho);
+        phi(i,j,k,n) = phi(i,j,k,n) + omega_dif_g_m_d * res;
+    }
+}
+
 template <typename T>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 void abec_gsrb_os (int i, int j, int k, int n,
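The pair of kernels above splits red-black Gauss-Seidel into a once-per-solve precompute and a cheap per-sweep update. The layout and the algebra it enables, restated from the code (nothing beyond what the kernels above compute):

    // pre_comp(i,j,k,2*n  ) = gamma               = alpha*a + sum of face couplings
    // pre_comp(i,j,k,2*n+1) = omega / (gamma - d),  d = boundary-mask correction
    //
    // each half-sweep then reduces to:
    //   rho = off-diagonal couplings dotted with neighboring phi
    //   res = rhs - (gamma*phi - rho)
    //   phi += pre_comp(i,j,k,2*n+1) * res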
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H
index 5fcdb85eab6..b993a108279 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H
@@ -155,6 +155,7 @@ public:
     [[nodiscard]] bool isSingular (int amrlev) const override { return m_is_singular[amrlev]; }
     [[nodiscard]] bool isBottomSingular () const override { return m_is_singular[0]; }
     void Fapply (int amrlev, int mglev, MF& out, const MF& in) const final;
+    void PrecomputeSmoothCoeffs (int amrlev, int mglev);
     void Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redblack) const final;
     void FFlux (int amrlev, const MFIter& mfi,
                 const Array<FAB*,AMREX_SPACEDIM>& flux,
@@ -194,6 +195,7 @@ public:
     RT m_b_scalar = std::numeric_limits<RT>::quiet_NaN();
     Vector<Vector<MF> > m_a_coeffs;
     Vector<Vector<Array<MF,AMREX_SPACEDIM> > > m_b_coeffs;
+    Vector<Vector<MF> > m_pre_comp;
 
     bool m_scalars_set = false;
     bool m_acoef_set = false;
@@ -277,15 +279,22 @@ MLABecLaplacianT<MF>::define_ab_coeffs ()
 {
     m_a_coeffs.resize(this->m_num_amr_levels);
     m_b_coeffs.resize(this->m_num_amr_levels);
+    m_pre_comp.resize(this->m_num_amr_levels);
     for (int amrlev = 0; amrlev < this->m_num_amr_levels; ++amrlev)
     {
         m_a_coeffs[amrlev].resize(this->m_num_mg_levels[amrlev]);
         m_b_coeffs[amrlev].resize(this->m_num_mg_levels[amrlev]);
+        m_pre_comp[amrlev].resize(this->m_num_mg_levels[amrlev]);
         for (int mglev = 0; mglev < this->m_num_mg_levels[amrlev]; ++mglev)
         {
             m_a_coeffs[amrlev][mglev].define
                 (this->m_grids[amrlev][mglev], this->m_dmap[amrlev][mglev],
-                 1, 0, MFInfo(), *(this->m_factory[amrlev][mglev]));
+                 1, 0, MFInfo(), *(this->m_factory[amrlev][mglev]));
+
+            const int pre_comp_nghost = 0;
+            m_pre_comp[amrlev][mglev].define
+                (this->m_grids[amrlev][mglev], this->m_dmap[amrlev][mglev],
+                 2, pre_comp_nghost, MFInfo(), *(this->m_factory[amrlev][mglev]));
             for (int idim = 0; idim < AMREX_SPACEDIM; ++idim)
             {
                 const BoxArray& ba = amrex::convert(this->m_grids[amrlev][mglev],
@@ -436,6 +445,12 @@ MLABecLaplacianT<MF>::prepareForSolve ()
 
     update_singular_flags();
 
+    for (int amrlev = 0; amrlev < this->m_num_amr_levels; ++amrlev) {
+        for (int mglev = 0; mglev < this->m_num_mg_levels[amrlev]; ++mglev) {
+            PrecomputeSmoothCoeffs(amrlev, mglev);
+        }
+    }
+
     m_needs_update = false;
 }
@@ -841,7 +856,6 @@ MLABecLaplacianT<MF>::Fapply (int amrlev, int mglev, MF& out, const MF& in) cons
                       dxinv, ascalar, bscalar);
         });
     }
-    Gpu::streamSynchronize();
 } else
 #endif
 {
@@ -875,6 +889,88 @@ MLABecLaplacianT<MF>::Fapply (int amrlev, int mglev, MF& out, const MF& in) cons
     }
 }
 
+template <typename MF>
+void
+MLABecLaplacianT<MF>::PrecomputeSmoothCoeffs ([[maybe_unused]] int amrlev, [[maybe_unused]] int mglev)
+{
+    BL_PROFILE("MLABecLaplacian::PrecomputeSmoothCoeffs()");
+
+#if (AMREX_SPACEDIM == 3) && defined(AMREX_USE_GPU)
+    bool regular_coarsening = true;
+    if (amrlev == 0 && mglev > 0) {
+        regular_coarsening = this->mg_coarsen_ratio_vec[mglev-1] == this->mg_coarsen_ratio;
+    }
+
+    if (!this->m_use_gauss_seidel || this->m_overset_mask[amrlev][mglev] ||
+        !Gpu::inLaunchRegion() || !regular_coarsening) {
+        return;
+    }
+
+    const MF& acoef = m_a_coeffs[amrlev][mglev];
+    AMREX_ALWAYS_ASSERT(acoef.nGrowVect() == 0);
+
+    const auto& undrrelxr = this->m_undrrelxr[amrlev][mglev];
+    const auto& maskvals  = this->m_maskvals [amrlev][mglev];
+
+    auto& pre_comp = this->m_pre_comp[amrlev][mglev];
+
+    const int nc = this->getNComp();
+    const Real* h = this->m_geom[amrlev][mglev].CellSize();
+    AMREX_D_TERM(const RT dhx = m_b_scalar/static_cast<RT>(h[0]*h[0]);,
+                 const RT dhy = m_b_scalar/static_cast<RT>(h[1]*h[1]);,
+                 const RT dhz = m_b_scalar/static_cast<RT>(h[2]*h[2]));
+    const RT alpha = m_a_scalar;
+
+    const auto& m0ma = maskvals[0].const_arrays();
+    const auto& m1ma = maskvals[1].const_arrays();
+#if (AMREX_SPACEDIM > 1)
+    const auto& m2ma = maskvals[2].const_arrays();
+    const auto& m3ma = maskvals[3].const_arrays();
+#if (AMREX_SPACEDIM > 2)
+    const auto& m4ma = maskvals[4].const_arrays();
+    const auto& m5ma = maskvals[5].const_arrays();
+#endif
+#endif
+
+    const auto& ama = acoef.const_arrays();
+
+    AMREX_D_TERM(const auto& bxma = m_b_coeffs[amrlev][mglev][0].const_arrays();,
+                 const auto& byma = m_b_coeffs[amrlev][mglev][1].const_arrays();,
+                 const auto& bzma = m_b_coeffs[amrlev][mglev][2].const_arrays(););
+
+    OrientationIter oitr;
+
+    const auto& f0ma = undrrelxr[oitr()].const_arrays(); ++oitr;
+    const auto& f1ma = undrrelxr[oitr()].const_arrays(); ++oitr;
+#if (AMREX_SPACEDIM > 1)
+    const auto& f2ma = undrrelxr[oitr()].const_arrays(); ++oitr;
+    const auto& f3ma = undrrelxr[oitr()].const_arrays(); ++oitr;
+#if (AMREX_SPACEDIM > 2)
+    const auto& f4ma = undrrelxr[oitr()].const_arrays(); ++oitr;
+    const auto& f5ma = undrrelxr[oitr()].const_arrays(); ++oitr;
+#endif
+#endif
+
+    const auto& pcma = pre_comp.arrays();
+
+    ParallelFor(acoef, IntVect(0), nc,
+    [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept
+    {
+        Box vbx(ama[box_no]);
+        precompute_abec_gsrb(i,j,k,n, pcma[box_no], alpha, ama[box_no],
+                             AMREX_D_DECL(dhx, dhy, dhz),
+                             AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]),
+                             AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]),
+                             AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]),
+                             AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]),
+                             AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]),
+                             vbx);
+    });
+
+    pre_comp.FillBoundary();
+#endif
+}
+
 template <typename MF>
 void
 MLABecLaplacianT<MF>::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redblack) const
@@ -924,6 +1020,8 @@ MLABecLaplacianT<MF>::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, in
 #endif
 #endif
 
+    [[maybe_unused]] const auto& pre_comp = this->m_pre_comp[amrlev][mglev];
+
     const int nc = this->getNComp();
     const Real* h = this->m_geom[amrlev][mglev].CellSize();
     AMREX_D_TERM(const RT dhx = m_b_scalar/static_cast<RT>(h[0]*h[0]);,
@@ -965,6 +1063,8 @@ MLABecLaplacianT<MF>::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, in
 #endif
 #endif
 
+        const auto& pcma = pre_comp.const_arrays();
+
         if (this->m_overset_mask[amrlev][mglev]) {
             const auto& osmma = this->m_overset_mask[amrlev][mglev]->const_arrays();
             if (this->m_use_gauss_seidel) {
@@ -1003,6 +1103,12 @@ MLABecLaplacianT<MF>::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, in
                 ParallelFor(sol, IntVect(0), nc,
                 [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept
                 {
+#if (AMREX_SPACEDIM == 3)
+                    abec_gsrb_v2(i,j,k,n, solnma[box_no], rhsma[box_no], pcma[box_no],
+                                 AMREX_D_DECL(dhx, dhy, dhz),
+                                 AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]),
+                                 redblack);
+#else
                     Box vbx(ama[box_no]);
                     abec_gsrb(i,j,k,n, solnma[box_no], rhsma[box_no], alpha, ama[box_no],
                               AMREX_D_DECL(dhx, dhy, dhz),
                               AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]),
                               AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]),
                               AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]),
                               AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]),
                               AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]),
                               vbx, redblack);
+#endif
                 });
             } else {
                 const auto& axma = Ax.const_arrays();
@@ -1031,7 +1138,6 @@ MLABecLaplacianT<MF>::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, in
             });
         }
     }
-    Gpu::streamSynchronize();
 } else
 #endif
 {
@@ -1271,7 +1377,6 @@ MLABecLaplacianT<MF>::normalize (int amrlev, int mglev, MF& mf) const
                       AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]),
                       dxinv, ascalar, bscalar);
         });
-        Gpu::streamSynchronize();
     } else
 #endif
     {
@@ -1383,7 +1488,6 @@ MLABecLaplacianT<MF>::makeNLinOp (int /*grid_size*/) const
             ama[box_no](i,j,k,n) = huge_alpha;
         }
     });
-    Gpu::streamSynchronize();
 } else
 #endif
 {
@@ -1434,7 +1538,6 @@ MLABecLaplacianT<MF>::copyNSolveSolution (MF& dst, MF const& src) const
             dstma[box_no](i,j,k,n) = RT(0.0);
         }
     });
-    Gpu::streamSynchronize();
 } else
 #endif
 {
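Callers do not need to opt in: prepareForSolve() fills m_pre_comp once, and the 3D GPU smoother uses it automatically. A typical driver, sketched with standard MLABecLaplacian/MLMG calls (all variable names are placeholders):

    amrex::MLABecLaplacian mlabec({geom}, {grids}, {dmap});
    mlabec.setDomainBC({AMREX_D_DECL(amrex::LinOpBCType::Dirichlet,
                                     amrex::LinOpBCType::Dirichlet,
                                     amrex::LinOpBCType::Dirichlet)},
                       {AMREX_D_DECL(amrex::LinOpBCType::Dirichlet,
                                     amrex::LinOpBCType::Dirichlet,
                                     amrex::LinOpBCType::Dirichlet)});
    mlabec.setLevelBC(0, &solution);
    mlabec.setScalars(ascalar, bscalar);
    mlabec.setACoeffs(0, acoef);                                 // cell-centered a
    mlabec.setBCoeffs(0, amrex::GetArrOfConstPtrs(face_bcoef));  // face-centered b
    amrex::MLMG mlmg(mlabec);
    mlmg.solve({&solution}, {&rhs}, reltol, abstol);  // prepareForSolve() runs here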
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H
index 4c21c7f1a47..e7d5f2659b3 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H
@@ -15,6 +15,9 @@
 
 namespace amrex {
 
+template <typename T>
+struct MLMGABCTag;
+
 template <typename MF>
 class MLCellLinOpT // NOLINT(cppcoreguidelines-virtual-class-destructor)
     : public MLLinOpT<MF>
@@ -83,7 +86,7 @@ public:
     void apply (int amrlev, int mglev, MF& out, MF& in, BCMode bc_mode,
                 StateMode s_mode, const MLMGBndryT<MF>* bndry=nullptr) const override;
     void smooth (int amrlev, int mglev, MF& sol, const MF& rhs,
-                 bool skip_fillboundary=false) const final;
+                 bool skip_fillboundary=false, int niter=1) const final;
 
     void solutionResidual (int amrlev, MF& resid, MF& x, const MF& b,
                            const MF* crse_bcdata=nullptr) override;
@@ -220,6 +223,9 @@ private:
     void computeVolInv () const;
     mutable Vector<Vector<RT> > m_volinv; // used by solvability fix
 
+    mutable TagVector<MLMGABCTag<RT>> m_bc_tag_vector;
+    mutable bool m_reuse_bc_tag_vector = false;
+
     int m_interpbndry_halfwidth = 2;
 };
 
@@ -728,37 +734,41 @@ MLCellLinOpT<MF>::applyBC (int amrlev, int mglev, MF& in, BCMode bc_mode, StateM
 #ifdef AMREX_USE_GPU
     if ((cross || tensorop) && Gpu::inLaunchRegion())
     {
-        Vector<MLMGABCTag<RT>> tags;
-        tags.reserve(in.local_size()*AMREX_SPACEDIM*ncomp);
-
-        for (MFIter mfi(in); mfi.isValid(); ++mfi) {
-            const Box& vbx = mfi.validbox();
-            const auto& iofab = in.array(mfi);
-            const auto & bdlv = bcondloc.bndryLocs(mfi);
-            const auto & bdcv = bcondloc.bndryConds(mfi);
+        if (!m_reuse_bc_tag_vector || !m_bc_tag_vector.is_defined()) {
+            Vector<MLMGABCTag<RT>> tags;
+            tags.reserve(in.local_size()*AMREX_SPACEDIM*ncomp);
 
-            for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
-                if (idim != hidden_direction) {
-                    const Orientation olo(idim,Orientation::low);
-                    const Orientation ohi(idim,Orientation::high);
-                    const auto& bvlo = (bndry != nullptr) ?
-                        bndry->bndryValues(olo).const_array(mfi) : foo;
-                    const auto& bvhi = (bndry != nullptr) ?
-                        bndry->bndryValues(ohi).const_array(mfi) : foo;
-                    for (int icomp = 0; icomp < ncomp; ++icomp) {
-                        tags.emplace_back(MLMGABCTag<RT>{iofab, bvlo, bvhi,
-                                                         maskvals[olo].const_array(mfi),
-                                                         maskvals[ohi].const_array(mfi),
-                                                         bdlv[icomp][olo], bdlv[icomp][ohi],
-                                                         amrex::adjCell(vbx,olo),
-                                                         bdcv[icomp][olo], bdcv[icomp][ohi],
-                                                         vbx.length(idim), icomp, idim});
+            for (MFIter mfi(in); mfi.isValid(); ++mfi) {
+                const Box& vbx = mfi.validbox();
+                const auto& iofab = in.array(mfi);
+                const auto & bdlv = bcondloc.bndryLocs(mfi);
+                const auto & bdcv = bcondloc.bndryConds(mfi);
+
+                for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+                    if (idim != hidden_direction) {
+                        const Orientation olo(idim,Orientation::low);
+                        const Orientation ohi(idim,Orientation::high);
+                        const auto& bvlo = (bndry != nullptr) ?
+                            bndry->bndryValues(olo).const_array(mfi) : foo;
+                        const auto& bvhi = (bndry != nullptr) ?
+                            bndry->bndryValues(ohi).const_array(mfi) : foo;
+                        for (int icomp = 0; icomp < ncomp; ++icomp) {
+                            tags.emplace_back(MLMGABCTag<RT>{iofab, bvlo, bvhi,
                                                             maskvals[olo].const_array(mfi),
+                                                             maskvals[ohi].const_array(mfi),
+                                                             bdlv[icomp][olo], bdlv[icomp][ohi],
+                                                             amrex::adjCell(vbx,olo),
+                                                             bdcv[icomp][olo], bdcv[icomp][ohi],
+                                                             vbx.length(idim), icomp, idim});
+                        }
                     }
                 }
             }
+            m_bc_tag_vector.define(tags);
         }
 
-        ParallelFor(tags,
+        ParallelFor(m_bc_tag_vector,
         [=] AMREX_GPU_DEVICE (int i, int j, int k, MLMGABCTag<RT> const& tag) noexcept
         {
             if (tag.dir == 0)
@@ -795,6 +805,11 @@ MLCellLinOpT<MF>::applyBC (int amrlev, int mglev, MF& in, BCMode bc_mode, StateM
 #endif
 #endif
         });
+
+        if (!m_reuse_bc_tag_vector) {
+            m_bc_tag_vector.undefine();
+        }
+
     } else
 #endif
     if (cross || tensorop)
@@ -1211,16 +1226,37 @@ MLCellLinOpT<MF>::apply (int amrlev, int mglev, MF& out, MF& in, BCMode bc_mode,
 template <typename MF>
 void
 MLCellLinOpT<MF>::smooth (int amrlev, int mglev, MF& sol, const MF& rhs,
-                          bool skip_fillboundary) const
+                          bool skip_fillboundary, int niter) const
 {
     BL_PROFILE("MLCellLinOp::smooth()");
-    for (int redblack = 0; redblack < 2; ++redblack)
-    {
-        applyBC(amrlev, mglev, sol, BCMode::Homogeneous, StateMode::Solution,
-                nullptr, skip_fillboundary);
-        Fsmooth(amrlev, mglev, sol, rhs, redblack);
-        skip_fillboundary = false;
+
+    m_reuse_bc_tag_vector = true;
+
+    int ghost_cells_valid = skip_fillboundary ? 1 : 0;
+
+    for (int i = 0; i < niter; ++i) {
+        for (int redblack = 0; redblack < 2; ++redblack)
+        {
+            if (ghost_cells_valid == 0) {
+                const int ncomp = this->getNComp();
+                const int cross = false; //isCrossStencil();
+                const int ngrow = sol.nGrowVect().min();
+                sol.FillBoundary(0, ncomp, IntVect(ngrow),
+                                 this->m_geom[amrlev][mglev].periodicity(), cross);
+                ghost_cells_valid = ngrow;
+            }
+
+            applyBC(amrlev, mglev, sol, BCMode::Homogeneous, StateMode::Solution,
+                    nullptr, true);
+            Fsmooth(amrlev, mglev, sol, rhs, redblack);
+
+            --ghost_cells_valid;
+        }
     }
+
+    m_reuse_bc_tag_vector = false;
+    m_bc_tag_vector.undefine();
 }
 
 template <typename MF>
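The rewritten smooth() trades one ghost-cell exchange per half-sweep for a wide fill whose layers are then consumed one per half-sweep. The accounting, reduced to its core (fill_all_ghosts and sweep are hypothetical stand-ins for sol.FillBoundary(...) and Fsmooth(...)):

    int ghost_cells_valid = skip_fillboundary ? 1 : 0;
    for (int it = 0; it < niter; ++it) {
        for (int redblack = 0; redblack < 2; ++redblack) {
            if (ghost_cells_valid == 0) {
                fill_all_ghosts();   // fills ngrow layers at once
                ghost_cells_valid = ngrow;
            }
            sweep(redblack);         // invalidates one ghost layer
            --ghost_cells_valid;
        }
    }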
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.H b/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.H
index 1d6aa224017..2ef9f8d7ca3 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.H
@@ -73,7 +73,7 @@ public:
                 StateMode s_mode, const MLMGBndryT<MF>* bndry=nullptr) const override;
 
     void smooth (int amrlev, int mglev, MF& sol, const MF& rhs,
-                 bool skip_fillboundary=false) const override;
+                 bool skip_fillboundary=false, int niter=1) const override;
 
     void solutionResidual (int amrlev, MF& resid, MF& x, const MF& b,
                            const MF* crse_bcdata=nullptr) override;
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.cpp b/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.cpp
index 3e500351c91..e38540d0176 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.cpp
@@ -319,7 +319,7 @@ MLCurlCurl::apply (int amrlev, int mglev, MF& out, MF& in, BCMode /*bc_mode*/,
 }
 
 void MLCurlCurl::smooth (int amrlev, int mglev, MF& sol, const MF& rhs,
-                         bool skip_fillboundary) const
+                         bool skip_fillboundary, int niter) const
 {
     AMREX_ASSERT(rhs[0].nGrowVect().allGE(1));
 
@@ -331,16 +331,18 @@ void MLCurlCurl::smooth (int amrlev, int mglev, MF& sol, const MF& rhs,
     int ncolors = 4;
 #endif
 
-    for (int color = 0; color < ncolors; ++color) {
-        if (!skip_fillboundary) {
-            applyBC(amrlev, mglev, sol, CurlCurlStateType::x);
-        }
-        skip_fillboundary = false;
+    for (int i = 0; i < niter; ++i) {
+        for (int color = 0; color < ncolors; ++color) {
+            if (!skip_fillboundary) {
+                applyBC(amrlev, mglev, sol, CurlCurlStateType::x);
+            }
+            skip_fillboundary = false;
 #if (AMREX_SPACEDIM == 1)
-        smooth1D(amrlev, mglev, sol, rhs, color);
+            smooth1D(amrlev, mglev, sol, rhs, color);
 #else
-        smooth4(amrlev, mglev, sol, rhs, color);
+            smooth4(amrlev, mglev, sol, rhs, color);
 #endif
+        }
     }
 }
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp
index 61580e4d368..53549cbec40 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp
@@ -302,8 +302,6 @@ MLEBNodeFDLaplacian::prepareForSolve ()
 
     AMREX_ASSERT(!isBottomSingular());
 
-    Gpu::streamSynchronize();
-
 #if (AMREX_SPACEDIM == 2)
     if (m_rz) {
         if (m_geom[0][0].ProbLo(0) == 0._rt) {
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H
index 08fa588b4a0..ccc017c6c26 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H
@@ -355,9 +355,10 @@ public:
      * \param sol unknowns
      * \param rhs RHS
      * \param skip_fillboundary flag controlling whether ghost cell filling can be skipped.
+     * \param niter number of (multicolor) smoothing iterations
      */
     virtual void smooth (int amrlev, int mglev, MF& sol, const MF& rhs,
-                         bool skip_fillboundary=false) const = 0;
+                         bool skip_fillboundary=false, int niter=1) const = 0;
 
     //! Divide mf by the diagonal component of the operator. Used by bicgstab.
     virtual void normalize (int amrlev, int mglev, MF& mf) const {
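Every operator now receives the iteration count instead of being called in a loop, which is what allows the ghost-cell and tag reuse above. A sketch of the contract a derived operator must honor (MyLinOp is hypothetical; the loop mirrors MLNodeLinOp::smooth below):

    void MyLinOp::smooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& rhs,
                          bool skip_fillboundary, int niter) const
    {
        for (int i = 0; i < niter; ++i) {
            if (!skip_fillboundary) {
                applyBC(amrlev, mglev, sol, BCMode::Homogeneous, StateMode::Correction);
            }
            Fsmooth(amrlev, mglev, sol, rhs);
            skip_fillboundary = false;   // only the first pass may skip the fill
        }
    }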
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLMG.H b/Src/LinearSolvers/MLMG/AMReX_MLMG.H
index 4b666f2c03d..01a1de73f59 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLMG.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLMG.H
@@ -1317,10 +1317,7 @@ MLMGT<MF>::mgVcycle (int amrlev, int mglev_top)
         setVal(cor[amrlev][mglev], RT(0.0));
 
         bool skip_fillboundary = true;
-        for (int i = 0; i < nu1; ++i) {
-            linop.smooth(amrlev, mglev, cor[amrlev][mglev], res[amrlev][mglev], skip_fillboundary);
-            skip_fillboundary = false;
-        }
+        linop.smooth(amrlev, mglev, cor[amrlev][mglev], res[amrlev][mglev], skip_fillboundary, nu1);
 
         // rescor = res - L(cor)
         computeResOfCorrection(amrlev, mglev);
@@ -1364,11 +1361,8 @@ MLMGT<MF>::mgVcycle (int amrlev, int mglev_top)
         }
         setVal(cor[amrlev][mglev_bottom], RT(0.0));
         bool skip_fillboundary = true;
-        for (int i = 0; i < nu1; ++i) {
-            linop.smooth(amrlev, mglev_bottom, cor[amrlev][mglev_bottom],
-                         res[amrlev][mglev_bottom], skip_fillboundary);
-            skip_fillboundary = false;
-        }
+        linop.smooth(amrlev, mglev_bottom, cor[amrlev][mglev_bottom],
+                     res[amrlev][mglev_bottom], skip_fillboundary, nu1);
 
         if (verbose >= 4) {
             computeResOfCorrection(amrlev, mglev_bottom);
@@ -1391,9 +1385,7 @@ MLMGT<MF>::mgVcycle (int amrlev, int mglev_top)
             amrex::Print() << print_ident << "AT LEVEL " << amrlev << " " << mglev
                            << " UP: Norm before smooth " << norm << "\n";
         }
-        for (int i = 0; i < nu2; ++i) {
-            linop.smooth(amrlev, mglev, cor[amrlev][mglev], res[amrlev][mglev]);
-        }
+        linop.smooth(amrlev, mglev, cor[amrlev][mglev], res[amrlev][mglev], false, nu2);
 
         if (cf_strategy == CFStrategy::ghostnodes) { computeResOfCorrection(amrlev, mglev); }
 
@@ -1512,10 +1504,7 @@ MLMGT<MF>::actualBottomSolve ()
 
     if (bottom_solver == BottomSolver::smoother)
     {
         bool skip_fillboundary = true;
-        for (int i = 0; i < nuf; ++i) {
-            linop.smooth(amrlev, mglev, x, b, skip_fillboundary);
-            skip_fillboundary = false;
-        }
+        linop.smooth(amrlev, mglev, x, b, skip_fillboundary, nuf);
     }
     else
     {
@@ -1589,9 +1578,7 @@ MLMGT<MF>::actualBottomSolve ()
                 setVal(cor[amrlev][mglev], RT(0.0));
             }
             const int n = (ret==0) ? nub : nuf;
-            for (int i = 0; i < n; ++i) {
-                linop.smooth(amrlev, mglev, x, b);
-            }
+            linop.smooth(amrlev, mglev, x, b, false, n);
         }
     }
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeABecLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeABecLaplacian.cpp
index 0fda51eac14..7ead526ee15 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeABecLaplacian.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeABecLaplacian.cpp
@@ -109,7 +109,6 @@ MLNodeABecLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFa
         yarr_ma[box_no](i,j,k) = (dmskarr_ma[box_no](i,j,k)) ? Real(0.0)
             : alpha*acoef_ma[box_no](i,j,k)*xarr_ma[box_no](i,j,k) - beta*lap;
     });
-    Gpu::streamSynchronize();
 }
 
 void
@@ -145,7 +144,6 @@ MLNodeABecLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiF
                                acoef_ma[box_no], bcoef_ma[box_no],
                                dmskarr_ma[box_no], dxinvarr);
         });
-        Gpu::streamSynchronize();
         if (m_smooth_num_sweeps > 1) { nodalSync(amrlev, mglev, sol); }
     }
 #else
@@ -193,7 +191,6 @@ MLNodeABecLaplacian::restriction (int amrlev, int cmglev, MultiFab& crse, MultiF
     {
         mlndlap_restriction(i,j,k,pcrse_ma[box_no],fine_ma[box_no],msk_ma[box_no]);
     });
-    Gpu::streamSynchronize();
 
     if (need_parallel_copy) {
         crse.ParallelCopy(cfine);
@@ -225,7 +222,6 @@ MLNodeABecLaplacian::interpolation (int amrlev, int fmglev, MultiFab& fine, cons
         mlndlap_interpadd_aa(i, j, k, fine_ma[box_no], crse_ma[box_no],
                              sig_ma[box_no], msk_ma[box_no]);
     });
-    Gpu::streamSynchronize();
 }
 
 void
@@ -280,7 +276,6 @@ MLNodeABecLaplacian::fixUpResidualMask (int amrlev, iMultiFab& resmsk)
     {
         if (fmsk[bno](i,j,k) == nodelap_detail::crse_fine_node) { rmsk[bno](i,j,k) = 1; }
     });
-    Gpu::streamSynchronize();
 }
 
 void
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp
index 8e490f30348..d6632e0ee0e 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp
@@ -265,7 +265,6 @@ MLNodeLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& i
 #endif
             });
         }
-        Gpu::streamSynchronize();
     } else
 #endif
     {
@@ -558,7 +557,6 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab&
             }
         }
 
-        Gpu::streamSynchronize();
         nodalSync(amrlev, mglev, sol);
     }
     else
@@ -700,8 +698,6 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab&
                 }
             }
         }
-
-        Gpu::streamSynchronize();
     }
 }
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H
index a9c1a65b525..8833974de03 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H
@@ -45,7 +45,7 @@ public:
                 StateMode s_mode, const MLMGBndry* bndry=nullptr) const final;
 
     void smooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& rhs,
-                 bool skip_fillboundary=false) const override;
+                 bool skip_fillboundary=false, int niter=1) const override;
 
     void solutionResidual (int amrlev, MultiFab& resid, MultiFab& x, const MultiFab& b,
                            const MultiFab* crse_bcdata=nullptr) override;
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp
index 9719ac3d3e8..a8d2453fd71 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp
@@ -167,12 +167,15 @@ MLNodeLinOp::apply (int amrlev, int mglev, MultiFab& out, MultiFab& in, BCMode b
 
 void
 MLNodeLinOp::smooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& rhs,
-                     bool skip_fillboundary) const
+                     bool skip_fillboundary, int niter) const
 {
-    if (!skip_fillboundary) {
-        applyBC(amrlev, mglev, sol, BCMode::Homogeneous, StateMode::Correction);
+    for (int i = 0; i < niter; ++i) {
+        if (!skip_fillboundary) {
+            applyBC(amrlev, mglev, sol, BCMode::Homogeneous, StateMode::Correction);
+        }
+        Fsmooth(amrlev, mglev, sol, rhs);
+        skip_fillboundary = false;
     }
-    Fsmooth(amrlev, mglev, sol, rhs);
 }
 
 Real
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.H
index 03a73cae86d..2f9b34a030b 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.H
@@ -58,7 +58,7 @@ public:
                                MultiFab& fine_res, MultiFab& fine_sol, const MultiFab& fine_rhs) const final;
 
     void smooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& rhs,
-                 bool skip_fillboundary=false) const final;
+                 bool skip_fillboundary=false, int niter=1) const final;
 
     void prepareForSolve () final;
     void Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& in) const final;
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp
index e8135aeba14..1833c8e566a 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp
@@ -219,24 +219,25 @@ MLNodeTensorLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const Multi
     {
         mlndtslap_adotx(i,j,k, out_a[box_no], in_a[box_no], dmsk_a[box_no], s);
     });
-    Gpu::streamSynchronize();
 #endif
 }
 
 void
 MLNodeTensorLaplacian::smooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& rhs,
-                               bool skip_fillboundary) const
+                               bool skip_fillboundary, int niter) const
 {
     BL_PROFILE("MLNodeTensorLaplacian::smooth()");
-    for (int redblack = 0; redblack < 4; ++redblack) {
-        if (!skip_fillboundary) {
-            applyBC(amrlev, mglev, sol, BCMode::Homogeneous, StateMode::Correction);
+    for (int i = 0; i < niter; ++i) {
+        for (int redblack = 0; redblack < 4; ++redblack) {
+            if (!skip_fillboundary) {
+                applyBC(amrlev, mglev, sol, BCMode::Homogeneous, StateMode::Correction);
+            }
+            m_redblack = redblack;
+            Fsmooth(amrlev, mglev, sol, rhs);
+            skip_fillboundary = false;
         }
-        m_redblack = redblack;
-        Fsmooth(amrlev, mglev, sol, rhs);
-        skip_fillboundary = false;
+        nodalSync(amrlev, mglev, sol);
     }
-    nodalSync(amrlev, mglev, sol);
 }
 
 void
@@ -261,7 +262,6 @@ MLNodeTensorLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const Mult
             mlndtslap_gauss_seidel(i, j, k, sol_a[box_no], rhs_a[box_no],
                                    dmsk_a[box_no], s);
         }
     });
-    Gpu::streamSynchronize();
 #endif
 }
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H
index 679a1c9e583..7e693ad9b64 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H
@@ -235,7 +235,6 @@ MLPoissonT<MF>::Fapply (int amrlev, int mglev, MF& out, const MF& in) const
             });
         }
     }
-    Gpu::streamSynchronize();
 } else
 #endif
 {
@@ -333,7 +332,6 @@ MLPoissonT<MF>::normalize (int amrlev, int mglev, MF& mf) const
     {
         mlpoisson_normalize(i,j,k, ma[box_no], AMREX_D_DECL(dhx,dhy,dhz), dx, probxlo);
     });
-    Gpu::streamSynchronize();
 } else
 #endif
 {