diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H
index e20f8bb4b79..2b5b4274766 100644
--- a/Src/Base/AMReX_FBI.H
+++ b/Src/Base/AMReX_FBI.H
@@ -53,7 +53,9 @@ void
 fab_to_fab (Vector<Array4CopyTag<T0, T1>> const& copy_tags, int scomp, int dcomp,
             int ncomp, F && f)
 {
-    detail::ParallelFor_doit(copy_tags,
+    TagVector<Array4CopyTag<T0, T1>> tv{copy_tags};
+
+    detail::ParallelFor_doit(tv,
     [=] AMREX_GPU_DEVICE (
 #ifdef AMREX_USE_SYCL
         sycl::nd_item<1> const& /*item*/,
@@ -85,7 +87,9 @@ fab_to_fab (Vector<Array4CopyTag<T0, T1>> const& copy_tags, int scomp, int dcom
     amrex::Abort("xxxxx TODO This function still has a bug. Even if we fix the bug, it should still be avoided because it is slow due to the lack of atomic operations for this type.");
 
-    detail::ParallelFor_doit(tags,
+    TagVector<TagType> tv{tags};
+
+    detail::ParallelFor_doit(tv,
     [=] AMREX_GPU_DEVICE (
 #ifdef AMREX_USE_SYCL
         sycl::nd_item<1> const& item,
diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H
index 5cb640d2f13..67e80061580 100644
--- a/Src/Base/AMReX_FabArray.H
+++ b/Src/Base/AMReX_FabArray.H
@@ -1623,7 +1623,10 @@ FabArray<FAB>::build_arrays () const
 #ifdef AMREX_USE_GPU
         m_arrays.dp = (A*)m_dp_arrays;
         m_const_arrays.dp = (AC*)m_dp_arrays + n;
-        Gpu::htod_memcpy(m_dp_arrays, m_hp_arrays, n*2*sizeof(A));
+        Gpu::htod_memcpy_async(m_dp_arrays, m_hp_arrays, n*2*sizeof(A));
+        if (!Gpu::inNoSyncRegion()) {
+            Gpu::streamSynchronize();
+        }
 #endif
     }
 }
@@ -1633,9 +1636,12 @@ void
 FabArray<FAB>::clear_arrays ()
 {
 #ifdef AMREX_USE_GPU
-    The_Pinned_Arena()->free(m_hp_arrays);
-    The_Arena()->free(m_dp_arrays);
-    m_dp_arrays = nullptr;
+    if (m_dp_arrays) {
+        Gpu::streamSynchronize();
+        The_Pinned_Arena()->free(m_hp_arrays);
+        The_Arena()->free(m_dp_arrays);
+        m_dp_arrays = nullptr;
+    }
 #else
     std::free(m_hp_arrays);
 #endif
diff --git a/Src/Base/AMReX_GpuControl.cpp b/Src/Base/AMReX_GpuControl.cpp
index 4c862b417c5..ddc5c6714f8 100644
--- a/Src/Base/AMReX_GpuControl.cpp
+++ b/Src/Base/AMReX_GpuControl.cpp
@@ -6,8 +6,8 @@ namespace amrex::Gpu {
 #if defined(AMREX_USE_GPU)
 bool in_launch_region = true;
 bool in_graph_region = false;
-bool in_single_stream_region = false;
-bool in_nosync_region = false;
+bool in_single_stream_region = true;
+bool in_nosync_region = true;
 #endif
 
 }
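With the two region flags now defaulting to true, GPU work enqueued by AMReX no longer forces an implicit stream synchronize; call sites that must observe completion do so explicitly, as build_arrays does above. A minimal sketch of that idiom, assuming only the Gpu calls shown in this patch (the wrapper function itself is hypothetical):

    // Enqueue a host-to-device copy and block only when the caller expects it.
    void upload (void* d_dst, void const* h_src, std::size_t nbytes)
    {
        amrex::Gpu::htod_memcpy_async(d_dst, h_src, nbytes);
        if (!amrex::Gpu::inNoSyncRegion()) {
            amrex::Gpu::streamSynchronize();
        }
    }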
diff --git a/Src/Base/AMReX_TagParallelFor.H b/Src/Base/AMReX_TagParallelFor.H
index 63bc127bba4..a4a4faadc52 100644
--- a/Src/Base/AMReX_TagParallelFor.H
+++ b/Src/Base/AMReX_TagParallelFor.H
@@ -101,31 +101,153 @@ struct VectorTag {
     Long size () const noexcept { return m_size; }
 };
 
-#ifdef AMREX_USE_GPU
-
 namespace detail {
 
-template <class T>
-std::enable_if_t<std::is_same<std::decay_t<decltype(std::declval<T>().box())>, Box>::value,
-                 Long>
-get_tag_size (T const& tag) noexcept
-{
-    AMREX_ASSERT(tag.box().numPts() < Long(std::numeric_limits<int>::max()));
-    return static_cast<Long>(tag.box().numPts());
-}
+    template <class T>
+    std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<T>().box())>, Box>, Long>
+    get_tag_size (T const& tag) noexcept
+    {
+        AMREX_ASSERT(tag.box().numPts() < Long(std::numeric_limits<int>::max()));
+        return static_cast<Long>(tag.box().numPts());
+    }
+
+    template <class T>
+    std::enable_if_t<std::is_integral_v<std::decay_t<decltype(std::declval<T>().size())> >, Long>
+    get_tag_size (T const& tag) noexcept
+    {
+        AMREX_ASSERT(tag.size() < Long(std::numeric_limits<int>::max()));
+        return tag.size();
+    }
+
+    template <class T>
+    constexpr
+    std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<T>().box())>, Box>, bool>
+    is_box_tag (T const&) { return true; }
+
+    template <class T>
+    constexpr
+    std::enable_if_t<std::is_integral_v<std::decay_t<decltype(std::declval<T>().size())> >, bool>
+    is_box_tag (T const&) { return false; }
 
-template <class T>
-std::enable_if_t<std::is_integral<std::decay_t<decltype(std::declval<T>().size())> >::value,
-                 Long>
-get_tag_size (T const& tag) noexcept
-{
-    AMREX_ASSERT(tag.size() < Long(std::numeric_limits<int>::max()));
-    return tag.size();
-}
 }
 
+template <class TagType>
+struct TagVector {
+
+    char* h_buffer = nullptr;
+    char* d_buffer = nullptr;
+    TagType* d_tags = nullptr;
+    int* d_nwarps = nullptr;
+    int ntags = 0;
+    int ntotwarps = 0;
+    int nblocks = 0;
+    bool defined = false;
+    static constexpr int nthreads = 256;
+
+    TagVector () = default;
+
+    TagVector (Vector<TagType> const& tags) {
+        define(tags);
+    }
+
+    ~TagVector () {
+        if (defined) {
+            undefine();
+        }
+    }
+
+    TagVector (const TagVector& other) = delete;
+    TagVector& operator= (const TagVector& other) = delete;
+
+    // Moves must steal the buffers. A defaulted move would copy the raw
+    // pointers and leave `other` marked as defined, so both destructors
+    // would free the same buffers.
+    TagVector (TagVector&& other) noexcept {
+        *this = std::move(other);
+    }
+    TagVector& operator= (TagVector&& other) noexcept {
+        if (this != &other) {
+            if (defined) { undefine(); }
+            h_buffer = std::exchange(other.h_buffer, nullptr);
+            d_buffer = std::exchange(other.d_buffer, nullptr);
+            d_tags = std::exchange(other.d_tags, nullptr);
+            d_nwarps = std::exchange(other.d_nwarps, nullptr);
+            ntags = std::exchange(other.ntags, 0);
+            ntotwarps = std::exchange(other.ntotwarps, 0);
+            nblocks = std::exchange(other.nblocks, 0);
+            defined = std::exchange(other.defined, false);
+        }
+        return *this;
+    }
+
+    [[nodiscard]] bool is_defined () const { return defined; }
+
+    void define (Vector<TagType> const& tags) {
+        if (defined) {
+            undefine();
+        }
+
+        ntags = tags.size();
+        if (ntags == 0) {
+            defined = true;
+            return;
+        }
+
+#ifdef AMREX_USE_GPU
+        Long l_ntotwarps = 0;
+        ntotwarps = 0;
+        Vector<int> nwarps;
+        nwarps.reserve(ntags+1);
+        for (int i = 0; i < ntags; ++i)
+        {
+            auto& tag = tags[i];
+            nwarps.push_back(ntotwarps);
+            auto nw = (detail::get_tag_size(tag) + Gpu::Device::warp_size-1) /
+                Gpu::Device::warp_size;
+            l_ntotwarps += nw;
+            ntotwarps += static_cast<int>(nw);
+        }
+        nwarps.push_back(ntotwarps);
+
+        std::size_t sizeof_tags = ntags*sizeof(TagType);
+        std::size_t offset_nwarps = Arena::align(sizeof_tags);
+        std::size_t sizeof_nwarps = (ntags+1)*sizeof(int);
+        std::size_t total_buf_size = offset_nwarps + sizeof_nwarps;
+
+        h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size);
+        d_buffer = (char*)The_Arena()->alloc(total_buf_size);
+
+        std::memcpy(h_buffer, tags.data(), sizeof_tags);
+        std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps);
+        Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size);
+
+        d_tags = reinterpret_cast<TagType*>(d_buffer);
+        d_nwarps = reinterpret_cast<int*>(d_buffer+offset_nwarps);
+
+        constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size;
+        nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block;
+
+        defined = true;
+
+        amrex::ignore_unused(l_ntotwarps);
+        AMREX_ALWAYS_ASSERT(l_ntotwarps+nwarps_per_block-1 < Long(std::numeric_limits<int>::max()));
+#else
+        std::size_t sizeof_tags = ntags*sizeof(TagType);
+        h_buffer = (char*)The_Pinned_Arena()->alloc(sizeof_tags);
+
+        std::memcpy(h_buffer, tags.data(), sizeof_tags);
+
+        d_tags = reinterpret_cast<TagType*>(h_buffer);
+
+        defined = true;
+#endif
+    }
+
+    void undefine () {
+        if (defined) {
+            Gpu::streamSynchronize();
+            The_Pinned_Arena()->free(h_buffer);
+            The_Arena()->free(d_buffer);
+            h_buffer = nullptr;
+            d_buffer = nullptr;
+            d_tags = nullptr;
+            d_nwarps = nullptr;
+            ntags = 0;
+            ntotwarps = 0;
+            nblocks = 0;
+            defined = false;
+        }
+    }
+};
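TagVector hoists the host-side packing and the host-to-device upload of a tag list out of the kernel launch, so one upload can serve many launches. A minimal usage sketch, assuming a hypothetical box-style tag type MyTag (anything with a box() method) and a hypothetical build_tags() helper:

    amrex::Vector<MyTag> tags = build_tags();  // host-side setup, done once
    amrex::TagVector<MyTag> tv{tags};          // pack into pinned memory + async upload
    for (int pass = 0; pass < npasses; ++pass) {
        amrex::ParallelFor(tv,
        [=] AMREX_GPU_DEVICE (int i, int j, int k, MyTag const& tag) noexcept
        {
            // ... work on cell (i,j,k) of tag ...
        });
    }
    // tv.undefine() (or the destructor) synchronizes the stream before freeing.

The Vector<TagType> overloads retained at the end of this file keep the old one-shot interface; they build a temporary TagVector and forward to the TagVector overloads.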
+
+namespace detail {
+
+#ifdef AMREX_USE_GPU
+
 template <class TagType, class F>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-std::enable_if_t<std::is_same<std::decay_t<decltype(std::declval<TagType>().box())>, Box>::value, void>
+std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<TagType>().box())>, Box>, void>
 tagparfor_call_f (
 #ifdef AMREX_USE_SYCL
     sycl::nd_item<1> const& item,
@@ -150,7 +272,7 @@ tagparfor_call_f (
 template <class TagType, class F>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-std::enable_if_t<std::is_integral<std::decay_t<decltype(std::declval<TagType>().size())> >::value, void>
+std::enable_if_t<std::is_integral_v<std::decay_t<decltype(std::declval<TagType>().size())> >, void>
 tagparfor_call_f (
 #ifdef AMREX_USE_SYCL
     sycl::nd_item<1> const& item,
@@ -167,48 +289,19 @@ tagparfor_call_f (
 
 template <class TagType, class F>
 void
-ParallelFor_doit (Vector<TagType> const& tags, F && f)
+ParallelFor_doit (TagVector<TagType> const& tv, F const& f)
 {
-    const int ntags = tags.size();
-    if (ntags == 0) { return; }
-
-    Long l_ntotwarps = 0;
-    int ntotwarps = 0;
-    Vector<int> nwarps;
-    nwarps.reserve(ntags+1);
-    for (int i = 0; i < ntags; ++i)
-    {
-        auto& tag = tags[i];
-        nwarps.push_back(ntotwarps);
-        auto nw = (get_tag_size(tag) + Gpu::Device::warp_size-1) / Gpu::Device::warp_size;
-        l_ntotwarps += nw;
-        ntotwarps += static_cast<int>(nw);
-    }
-    nwarps.push_back(ntotwarps);
+    AMREX_ALWAYS_ASSERT(tv.is_defined());
 
-    std::size_t sizeof_tags = ntags*sizeof(TagType);
-    std::size_t offset_nwarps = Arena::align(sizeof_tags);
-    std::size_t sizeof_nwarps = (ntags+1)*sizeof(int);
-    std::size_t total_buf_size = offset_nwarps + sizeof_nwarps;
+    if (tv.ntags == 0) { return; }
 
-    char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size);
-    char* d_buffer = (char*)The_Arena()->alloc(total_buf_size);
+    const auto d_tags = tv.d_tags;
+    const auto d_nwarps = tv.d_nwarps;
+    const auto ntags = tv.ntags;
+    const auto ntotwarps = tv.ntotwarps;
+    constexpr auto nthreads = TagVector<TagType>::nthreads;
 
-    std::memcpy(h_buffer, tags.data(), sizeof_tags);
-    std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps);
-    Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size);
-
-    auto d_tags = reinterpret_cast<TagType*>(d_buffer);
-    auto d_nwarps = reinterpret_cast<int*>(d_buffer+offset_nwarps);
-
-    constexpr int nthreads = 256;
-    constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size;
-    int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block;
-
-    amrex::ignore_unused(l_ntotwarps);
-    AMREX_ASSERT(l_ntotwarps+nwarps_per_block-1 < Long(std::numeric_limits<int>::max()));
-
-    amrex::launch(nblocks, nthreads, Gpu::gpuStream(),
+    amrex::launch(tv.nblocks, nthreads, Gpu::gpuStream(),
 #ifdef AMREX_USE_SYCL
     [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
     [[sycl::reqd_work_group_size(nthreads)]]
@@ -241,20 +334,60 @@ ParallelFor_doit (Vector<TagType> const& tags, F && f)
         tagparfor_call_f( icell, d_tags[tag_id], f);
 #endif
     });
+}
+
+#else // ifdef AMREX_USE_GPU
 
-    Gpu::streamSynchronize();
-    The_Pinned_Arena()->free(h_buffer);
-    The_Arena()->free(d_buffer);
+template <class TagType, class F>
+void
+ParallelFor_doit (TagVector<TagType> const& tv, F const& f)
+{
+    AMREX_ALWAYS_ASSERT(tv.is_defined());
+
+    if (tv.ntags == 0) { return; }
+
+    const auto d_tags = tv.d_tags;
+    const auto ntags = tv.ntags;
+
+#ifdef AMREX_USE_OMP
+#pragma omp parallel for
+#endif
+    for (int itag = 0; itag < ntags; ++itag) {
+
+        const auto& t = d_tags[itag];
+
+        // TagType{} is used only for overload selection; a runtime reference
+        // would not be usable in the if-constexpr constant expression.
+        if constexpr (is_box_tag(TagType{})) {
+            const auto lo = amrex::lbound(t.box());
+            const auto hi = amrex::ubound(t.box());
+
+            for (int k = lo.z; k <= hi.z; ++k) {
+                for (int j = lo.y; j <= hi.y; ++j) {
+                    AMREX_PRAGMA_SIMD
+                    for (int i = lo.x; i <= hi.x; ++i) {
+                        f(0, 1, i, j, k, t);
+                    }
+                }
+            }
+        } else {
+            const auto size = t.size();
+
+            AMREX_PRAGMA_SIMD
+            for (int i = 0; i < size; ++i) {
+                f(i, size, t);
+            }
+        }
+    }
 }
+
+#endif
+
 }
 
 template <class TagType, class F>
-std::enable_if_t<std::is_same<std::decay_t<decltype(std::declval<TagType>().box())>,
-                              Box>::value>
-ParallelFor (Vector<TagType> const& tags, int ncomp, F && f)
+std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<TagType>().box())>, Box>>
+ParallelFor (TagVector<TagType> const& tv, int ncomp, F const& f)
 {
-    detail::ParallelFor_doit(tags,
+    detail::ParallelFor_doit(tv,
     [=] AMREX_GPU_DEVICE (
 #ifdef AMREX_USE_SYCL
         sycl::nd_item<1> const& /*item*/,
@@ -270,10 +403,10 @@ ParallelFor (Vector<TagType> const& tags, int ncomp, F && f)
 }
 
 template <class TagType, class F>
-std::enable_if_t<std::is_same<std::decay_t<decltype(std::declval<TagType>().box())>, Box>::value, void>
-ParallelFor (Vector<TagType> const& tags, F && f)
+std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<TagType>().box())>, Box>, void>
+ParallelFor (TagVector<TagType> const& tv, F const& f)
 {
-    detail::ParallelFor_doit(tags,
+    detail::ParallelFor_doit(tv,
     [=] AMREX_GPU_DEVICE (
 #ifdef AMREX_USE_SYCL
         sycl::nd_item<1> const& /*item*/,
@@ -287,10 +420,10 @@ ParallelFor (Vector<TagType> const& tags, F && f)
 }
 
 template <class TagType, class F>
-std::enable_if_t<std::is_integral<std::decay_t<decltype(std::declval<TagType>().size())> >::value, void>
-ParallelFor (Vector<TagType> const& tags, F && f)
+std::enable_if_t<std::is_integral_v<std::decay_t<decltype(std::declval<TagType>().size())> >, void>
+ParallelFor (TagVector<TagType> const& tv, F const& f)
 {
-    detail::ParallelFor_doit(tags,
+    detail::ParallelFor_doit(tv,
     [=] AMREX_GPU_DEVICE (
 #ifdef AMREX_USE_SYCL
         sycl::nd_item<1> const& /*item*/,
@@ -303,7 +436,29 @@ ParallelFor (Vector<TagType> const& tags, F && f)
     });
 }
 
-#endif
+template <class TagType, class F>
+std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<TagType>().box())>, Box>>
+ParallelFor (Vector<TagType> const& tags, int ncomp, F && f)
+{
+    TagVector<TagType> tv{tags};
+    ParallelFor(tv, ncomp, std::forward<F>(f));
+}
+
+template <class TagType, class F>
+std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<TagType>().box())>, Box>, void>
+ParallelFor (Vector<TagType> const& tags, F && f)
+{
+    TagVector<TagType> tv{tags};
+    ParallelFor(tv, std::forward<F>(f));
+}
+
+template <class TagType, class F>
+std::enable_if_t<std::is_integral_v<std::decay_t<decltype(std::declval<TagType>().size())> >, void>
+ParallelFor (Vector<TagType> const& tags, F && f)
+{
+    TagVector<TagType> tv{tags};
+    ParallelFor(tv, std::forward<F>(f));
+}
 
 }
diff --git a/Src/Boundary/AMReX_BndryRegister.H b/Src/Boundary/AMReX_BndryRegister.H
index 7892218734f..12aa5dd2f93 100644
--- a/Src/Boundary/AMReX_BndryRegister.H
+++ b/Src/Boundary/AMReX_BndryRegister.H
@@ -267,9 +267,12 @@ BndryRegisterT<MF>::copyFrom (const MF& src, int nghost,
                               int src_comp, int dest_comp, int num_comp,
                               const Periodicity& period)
 {
-    for (OrientationIter face; face; ++face)
-    {
-        bndry[face()].copyFrom(src,nghost,src_comp,dest_comp,num_comp,period);
+    for (OrientationIter face; face; ++face) {
+        bndry[face()].multiFab().ParallelCopy_nowait(src,src_comp,dest_comp,num_comp,nghost,0,period);
+    }
+
+    for (OrientationIter face; face; ++face) {
+        bndry[face()].multiFab().ParallelCopy_finish();
     }
     return *this;
 }
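Splitting copyFrom into a nowait phase and a finish phase lets the six face copies overlap their communication instead of running back to back. The same split-phase idiom applies to any group of independent copies; a sketch with hypothetical MultiFabs (the member functions are the ones used above):

    dst_a.ParallelCopy_nowait(src_a, 0, 0, ncomp, ngsrc, ngdst, period);
    dst_b.ParallelCopy_nowait(src_b, 0, 0, ncomp, ngsrc, ngdst, period);
    // ... both exchanges are now in flight ...
    dst_a.ParallelCopy_finish();
    dst_b.ParallelCopy_finish();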
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H
index bb5172396cf..9bdbf9366ee 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H
@@ -263,6 +263,83 @@ void abec_gsrb (int i, int j, int k, int n, Array4<T> const& phi, Array4<T cons
 
+template <typename T>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void precompute_abec_gsrb (int i, int j, int k, int n,
+                           Array4<T> const& pre_comp,
+                           T alpha, Array4<T const> const& a,
+                           T dhx, T dhy, T dhz,
+                           Array4<T const> const& bX, Array4<T const> const& bY,
+                           Array4<T const> const& bZ,
+                           Array4<int const> const& m0, Array4<int const> const& m2,
+                           Array4<int const> const& m4,
+                           Array4<int const> const& m1, Array4<int const> const& m3,
+                           Array4<int const> const& m5,
+                           Array4<T const> const& f0, Array4<T const> const& f2,
+                           Array4<T const> const& f4,
+                           Array4<T const> const& f1, Array4<T const> const& f3,
+                           Array4<T const> const& f5,
+                           Box const& vbox) noexcept
+{
+    constexpr T omega = T(1.15);
+
+    const auto vlo = amrex::lbound(vbox);
+    const auto vhi = amrex::ubound(vbox);
+
+    T cf0 = (i == vlo.x && m0(vlo.x-1,j,k) > 0)
+        ? f0(vlo.x,j,k,n) : T(0.0);
+    T cf1 = (j == vlo.y && m1(i,vlo.y-1,k) > 0)
+        ? f1(i,vlo.y,k,n) : T(0.0);
+    T cf2 = (k == vlo.z && m2(i,j,vlo.z-1) > 0)
+        ? f2(i,j,vlo.z,n) : T(0.0);
+    T cf3 = (i == vhi.x && m3(vhi.x+1,j,k) > 0)
+        ? f3(vhi.x,j,k,n) : T(0.0);
+    T cf4 = (j == vhi.y && m4(i,vhi.y+1,k) > 0)
+        ? f4(i,vhi.y,k,n) : T(0.0);
+    T cf5 = (k == vhi.z && m5(i,j,vhi.z+1) > 0)
+        ? f5(i,j,vhi.z,n) : T(0.0);
+
+    T gamma = alpha*a(i,j,k)
+        +   dhx*(bX(i,j,k,n)+bX(i+1,j,k,n))
+        +   dhy*(bY(i,j,k,n)+bY(i,j+1,k,n))
+        +   dhz*(bZ(i,j,k,n)+bZ(i,j,k+1,n));
+
+    T g_m_d = gamma
+        - (dhx*(bX(i,j,k,n)*cf0 + bX(i+1,j,k,n)*cf3)
+        +  dhy*(bY(i,j,k,n)*cf1 + bY(i,j+1,k,n)*cf4)
+        +  dhz*(bZ(i,j,k,n)*cf2 + bZ(i,j,k+1,n)*cf5));
+
+    T omega_dif_g_m_d = omega / g_m_d;
+
+    pre_comp(i, j, k, 2*n) = gamma;
+    pre_comp(i, j, k, 2*n+1) = omega_dif_g_m_d;
+}
+
+template <typename T>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void abec_gsrb_v2 (int i, int j, int k, int n, Array4<T> const& phi, Array4<T const> const& rhs,
+                   Array4<T const> const& pre_comp,
+                   T dhx, T dhy, T dhz,
+                   Array4<T const> const& bX, Array4<T const> const& bY,
+                   Array4<T const> const& bZ, int redblack) noexcept
+{
+    if ((i+j+k+redblack)%2 == 0) {
+
+        T gamma = pre_comp(i,j,k,2*n);
+        T omega_dif_g_m_d = pre_comp(i,j,k,2*n+1);
+
+        T rho = dhx*( bX(i  ,j,k,n)*phi(i-1,j,k,n)
+                    + bX(i+1,j,k,n)*phi(i+1,j,k,n) )
+            +   dhy*( bY(i,j  ,k,n)*phi(i,j-1,k,n)
+                    + bY(i,j+1,k,n)*phi(i,j+1,k,n) )
+            +   dhz*( bZ(i,j,k  ,n)*phi(i,j,k-1,n)
+                    + bZ(i,j,k+1,n)*phi(i,j,k+1,n) );
+
+        T res = rhs(i,j,k,n) - (gamma*phi(i,j,k,n) - rho);
+        phi(i,j,k,n) = phi(i,j,k,n) + omega_dif_g_m_d * res;
+    }
+}
+
 template <typename T>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 void abec_gsrb_os (int i, int j, int k, int n,
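The pair of kernels above splits red-black Gauss-Seidel into a once-per-solve precompute and a cheap per-sweep update. The layout and the algebra it enables, restated from the code (nothing beyond what the kernels above compute):

    // pre_comp(i,j,k,2*n  ) = gamma               = alpha*a + sum of face couplings
    // pre_comp(i,j,k,2*n+1) = omega / (gamma - d),  d = boundary-mask correction
    //
    // each half-sweep then reduces to:
    //   rho = off-diagonal couplings dotted with neighboring phi
    //   res = rhs - (gamma*phi - rho)
    //   phi += pre_comp(i,j,k,2*n+1) * res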
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H
index 5fcdb85eab6..b993a108279 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H
@@ -155,6 +155,7 @@ public:
     [[nodiscard]] bool isSingular (int amrlev) const override { return m_is_singular[amrlev]; }
     [[nodiscard]] bool isBottomSingular () const override { return m_is_singular[0]; }
     void Fapply (int amrlev, int mglev, MF& out, const MF& in) const final;
+    void PrecomputeSmoothCoeffs (int amrlev, int mglev);
     void Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redblack) const final;
     void FFlux (int amrlev, const MFIter& mfi,
                 const Array<FAB*,AMREX_SPACEDIM>& flux,
@@ -194,6 +195,7 @@ public:
     RT m_b_scalar = std::numeric_limits<RT>::quiet_NaN();
     Vector<Vector<MF> > m_a_coeffs;
     Vector<Vector<Array<MF,AMREX_SPACEDIM> > > m_b_coeffs;
+    Vector<Vector<MF> > m_pre_comp;
 
     bool m_scalars_set = false;
     bool m_acoef_set = false;
@@ -277,15 +279,22 @@ MLABecLaplacianT<MF>::define_ab_coeffs ()
 {
     m_a_coeffs.resize(this->m_num_amr_levels);
     m_b_coeffs.resize(this->m_num_amr_levels);
+    m_pre_comp.resize(this->m_num_amr_levels);
     for (int amrlev = 0; amrlev < this->m_num_amr_levels; ++amrlev)
     {
         m_a_coeffs[amrlev].resize(this->m_num_mg_levels[amrlev]);
         m_b_coeffs[amrlev].resize(this->m_num_mg_levels[amrlev]);
+        m_pre_comp[amrlev].resize(this->m_num_mg_levels[amrlev]);
         for (int mglev = 0; mglev < this->m_num_mg_levels[amrlev]; ++mglev)
         {
             m_a_coeffs[amrlev][mglev].define
                 (this->m_grids[amrlev][mglev], this->m_dmap[amrlev][mglev],
-                 1, 0, MFInfo(), *(this->m_factory[amrlev][mglev]));
+                 1, 0, MFInfo(), *(this->m_factory[amrlev][mglev]));
+
+            const int pre_comp_nghost = 0;
+            m_pre_comp[amrlev][mglev].define
+                (this->m_grids[amrlev][mglev], this->m_dmap[amrlev][mglev],
+                 2, pre_comp_nghost, MFInfo(), *(this->m_factory[amrlev][mglev]));
             for (int idim = 0; idim < AMREX_SPACEDIM; ++idim)
             {
                 const BoxArray& ba = amrex::convert(this->m_grids[amrlev][mglev],
@@ -436,6 +445,12 @@ MLABecLaplacianT<MF>::prepareForSolve ()
 
     update_singular_flags();
 
+    for (int amrlev = 0; amrlev < this->m_num_amr_levels; ++amrlev) {
+        for (int mglev = 0; mglev < this->m_num_mg_levels[amrlev]; ++mglev) {
+            PrecomputeSmoothCoeffs(amrlev, mglev);
+        }
+    }
+
     m_needs_update = false;
 }
@@ -841,7 +856,6 @@ MLABecLaplacianT<MF>::Fapply (int amrlev, int mglev, MF& out, const MF& in) cons
                       dxinv, ascalar, bscalar);
         });
     }
-    Gpu::streamSynchronize();
 } else
 #endif
 {
@@ -875,6 +889,88 @@ MLABecLaplacianT<MF>::Fapply (int amrlev, int mglev, MF& out, const MF& in) cons
     }
 }
 
+template <typename MF>
+void
+MLABecLaplacianT<MF>::PrecomputeSmoothCoeffs ([[maybe_unused]] int amrlev, [[maybe_unused]] int mglev)
+{
+    BL_PROFILE("MLABecLaplacian::PrecomputeSmoothCoeffs()");
+
+#if (AMREX_SPACEDIM == 3) && defined(AMREX_USE_GPU)
+    bool regular_coarsening = true;
+    if (amrlev == 0 && mglev > 0) {
+        regular_coarsening = this->mg_coarsen_ratio_vec[mglev-1] == this->mg_coarsen_ratio;
+    }
+
+    if (!this->m_use_gauss_seidel || this->m_overset_mask[amrlev][mglev] ||
+        !Gpu::inLaunchRegion() || !regular_coarsening) {
+        return;
+    }
+
+    const MF& acoef = m_a_coeffs[amrlev][mglev];
+    AMREX_ALWAYS_ASSERT(acoef.nGrowVect() == 0);
+
+    const auto& undrrelxr = this->m_undrrelxr[amrlev][mglev];
+    const auto& maskvals  = this->m_maskvals [amrlev][mglev];
+
+    auto& pre_comp = this->m_pre_comp[amrlev][mglev];
+
+    const int nc = this->getNComp();
+    const Real* h = this->m_geom[amrlev][mglev].CellSize();
+    AMREX_D_TERM(const RT dhx = m_b_scalar/static_cast<RT>(h[0]*h[0]);,
+                 const RT dhy = m_b_scalar/static_cast<RT>(h[1]*h[1]);,
+                 const RT dhz = m_b_scalar/static_cast<RT>(h[2]*h[2]));
+    const RT alpha = m_a_scalar;
+
+    const auto& m0ma = maskvals[0].const_arrays();
+    const auto& m1ma = maskvals[1].const_arrays();
+#if (AMREX_SPACEDIM > 1)
+    const auto& m2ma = maskvals[2].const_arrays();
+    const auto& m3ma = maskvals[3].const_arrays();
+#if (AMREX_SPACEDIM > 2)
+    const auto& m4ma = maskvals[4].const_arrays();
+    const auto& m5ma = maskvals[5].const_arrays();
+#endif
+#endif
+
+    const auto& ama = acoef.const_arrays();
+
+    AMREX_D_TERM(const auto& bxma = m_b_coeffs[amrlev][mglev][0].const_arrays();,
+                 const auto& byma = m_b_coeffs[amrlev][mglev][1].const_arrays();,
+                 const auto& bzma = m_b_coeffs[amrlev][mglev][2].const_arrays(););
+
+    OrientationIter oitr;
+
+    const auto& f0ma = undrrelxr[oitr()].const_arrays(); ++oitr;
+    const auto& f1ma = undrrelxr[oitr()].const_arrays(); ++oitr;
+#if (AMREX_SPACEDIM > 1)
+    const auto& f2ma = undrrelxr[oitr()].const_arrays(); ++oitr;
+    const auto& f3ma = undrrelxr[oitr()].const_arrays(); ++oitr;
+#if (AMREX_SPACEDIM > 2)
+    const auto& f4ma = undrrelxr[oitr()].const_arrays(); ++oitr;
+    const auto& f5ma = undrrelxr[oitr()].const_arrays(); ++oitr;
+#endif
+#endif
+
+    const auto& pcma = pre_comp.arrays();
+
+    ParallelFor(acoef, IntVect(0), nc,
+    [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept
+    {
+        Box vbx(ama[box_no]);
+        precompute_abec_gsrb(i,j,k,n, pcma[box_no], alpha, ama[box_no],
+                             AMREX_D_DECL(dhx, dhy, dhz),
+                             AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]),
+                             AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]),
+                             AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]),
+                             AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]),
+                             AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]),
+                             vbx);
+    });
+
+    pre_comp.FillBoundary();
+#endif
+}
+
 template <typename MF>
 void
 MLABecLaplacianT<MF>::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redblack) const
@@ -924,6 +1020,8 @@ MLABecLaplacianT<MF>::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, in
 #endif
 #endif
 
+    [[maybe_unused]] const auto& pre_comp = this->m_pre_comp[amrlev][mglev];
+
     const int nc = this->getNComp();
     const Real* h = this->m_geom[amrlev][mglev].CellSize();
     AMREX_D_TERM(const RT dhx = m_b_scalar/static_cast<RT>(h[0]*h[0]);,
@@ -965,6 +1063,8 @@ MLABecLaplacianT<MF>::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, in
 #endif
 #endif
 
+        const auto& pcma = pre_comp.const_arrays();
+
         if (this->m_overset_mask[amrlev][mglev]) {
             const auto& osmma = this->m_overset_mask[amrlev][mglev]->const_arrays();
             if (this->m_use_gauss_seidel) {
@@ -1003,6 +1103,12 @@ MLABecLaplacianT<MF>::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, in
                 ParallelFor(sol, IntVect(0), nc,
                 [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept
                 {
+#if (AMREX_SPACEDIM == 3)
+                    abec_gsrb_v2(i,j,k,n, solnma[box_no], rhsma[box_no], pcma[box_no],
+                                 AMREX_D_DECL(dhx, dhy, dhz),
+                                 AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]),
+                                 redblack);
+#else
                     Box vbx(ama[box_no]);
                     abec_gsrb(i,j,k,n, solnma[box_no], rhsma[box_no], alpha, ama[box_no],
                               AMREX_D_DECL(dhx, dhy, dhz),
                               AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]),
                               AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]),
                               AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]),
                               AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]),
                               AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]),
                               vbx, redblack);
+#endif
                 });
             } else {
                 const auto& axma = Ax.const_arrays();
@@ -1031,7 +1138,6 @@ MLABecLaplacianT<MF>::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, in
             });
         }
     }
-    Gpu::streamSynchronize();
 } else
 #endif
 {
@@ -1271,7 +1377,6 @@ MLABecLaplacianT<MF>::normalize (int amrlev, int mglev, MF& mf) const
                       AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]),
                       dxinv, ascalar, bscalar);
         });
-        Gpu::streamSynchronize();
     } else
 #endif
     {
@@ -1383,7 +1488,6 @@ MLABecLaplacianT<MF>::makeNLinOp (int /*grid_size*/) const
             ama[box_no](i,j,k,n) = huge_alpha;
         }
     });
-    Gpu::streamSynchronize();
 } else
 #endif
 {
@@ -1434,7 +1538,6 @@ MLABecLaplacianT<MF>::copyNSolveSolution (MF& dst, MF const& src) const
             dstma[box_no](i,j,k,n) = RT(0.0);
         }
     });
-    Gpu::streamSynchronize();
 } else
 #endif
 {
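Callers do not need to opt in: prepareForSolve() fills m_pre_comp once, and the 3D GPU smoother uses it automatically. A typical driver, sketched with standard MLABecLaplacian/MLMG calls (all variable names are placeholders):

    amrex::MLABecLaplacian mlabec({geom}, {grids}, {dmap});
    mlabec.setDomainBC({AMREX_D_DECL(amrex::LinOpBCType::Dirichlet,
                                     amrex::LinOpBCType::Dirichlet,
                                     amrex::LinOpBCType::Dirichlet)},
                       {AMREX_D_DECL(amrex::LinOpBCType::Dirichlet,
                                     amrex::LinOpBCType::Dirichlet,
                                     amrex::LinOpBCType::Dirichlet)});
    mlabec.setLevelBC(0, &solution);
    mlabec.setScalars(ascalar, bscalar);
    mlabec.setACoeffs(0, acoef);                                 // cell-centered a
    mlabec.setBCoeffs(0, amrex::GetArrOfConstPtrs(face_bcoef));  // face-centered b
    amrex::MLMG mlmg(mlabec);
    mlmg.solve({&solution}, {&rhs}, reltol, abstol);  // prepareForSolve() runs here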
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H
index 4c21c7f1a47..e7d5f2659b3 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H
@@ -15,6 +15,9 @@
 
 namespace amrex {
 
+template <typename T>
+struct MLMGABCTag;
+
 template <typename MF>
 class MLCellLinOpT // NOLINT(cppcoreguidelines-virtual-class-destructor)
     : public MLLinOpT<MF>
@@ -83,7 +86,7 @@ public:
     void apply (int amrlev, int mglev, MF& out, MF& in, BCMode bc_mode,
                 StateMode s_mode, const MLMGBndryT<MF>* bndry=nullptr) const override;
     void smooth (int amrlev, int mglev, MF& sol, const MF& rhs,
-                 bool skip_fillboundary=false) const final;
+                 bool skip_fillboundary=false, int niter=1) const final;
 
     void solutionResidual (int amrlev, MF& resid, MF& x, const MF& b,
                            const MF* crse_bcdata=nullptr) override;
@@ -220,6 +223,9 @@ private:
     void computeVolInv () const;
     mutable Vector<Vector<RT> > m_volinv; // used by solvability fix
 
+    mutable TagVector<MLMGABCTag<RT>> m_bc_tag_vector;
+    mutable bool m_reuse_bc_tag_vector = false;
+
     int m_interpbndry_halfwidth = 2;
 };
 
@@ -728,37 +734,41 @@ MLCellLinOpT<MF>::applyBC (int amrlev, int mglev, MF& in, BCMode bc_mode, StateM
 #ifdef AMREX_USE_GPU
     if ((cross || tensorop) && Gpu::inLaunchRegion())
     {
-        Vector<MLMGABCTag<RT>> tags;
-        tags.reserve(in.local_size()*AMREX_SPACEDIM*ncomp);
-
-        for (MFIter mfi(in); mfi.isValid(); ++mfi) {
-            const Box& vbx = mfi.validbox();
-            const auto& iofab = in.array(mfi);
-            const auto & bdlv = bcondloc.bndryLocs(mfi);
-            const auto & bdcv = bcondloc.bndryConds(mfi);
+        if (!m_reuse_bc_tag_vector || !m_bc_tag_vector.is_defined()) {
+            Vector<MLMGABCTag<RT>> tags;
+            tags.reserve(in.local_size()*AMREX_SPACEDIM*ncomp);
 
-            for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
-                if (idim != hidden_direction) {
-                    const Orientation olo(idim,Orientation::low);
-                    const Orientation ohi(idim,Orientation::high);
-                    const auto& bvlo = (bndry != nullptr) ?
-                        bndry->bndryValues(olo).const_array(mfi) : foo;
-                    const auto& bvhi = (bndry != nullptr) ?
-                        bndry->bndryValues(ohi).const_array(mfi) : foo;
-                    for (int icomp = 0; icomp < ncomp; ++icomp) {
-                        tags.emplace_back(MLMGABCTag<RT>{iofab, bvlo, bvhi,
-                                                         maskvals[olo].const_array(mfi),
-                                                         maskvals[ohi].const_array(mfi),
-                                                         bdlv[icomp][olo], bdlv[icomp][ohi],
-                                                         amrex::adjCell(vbx,olo),
-                                                         bdcv[icomp][olo], bdcv[icomp][ohi],
-                                                         vbx.length(idim), icomp, idim});
+            for (MFIter mfi(in); mfi.isValid(); ++mfi) {
+                const Box& vbx = mfi.validbox();
+                const auto& iofab = in.array(mfi);
+                const auto & bdlv = bcondloc.bndryLocs(mfi);
+                const auto & bdcv = bcondloc.bndryConds(mfi);
+
+                for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+                    if (idim != hidden_direction) {
+                        const Orientation olo(idim,Orientation::low);
+                        const Orientation ohi(idim,Orientation::high);
+                        const auto& bvlo = (bndry != nullptr) ?
+                            bndry->bndryValues(olo).const_array(mfi) : foo;
+                        const auto& bvhi = (bndry != nullptr) ?
+                            bndry->bndryValues(ohi).const_array(mfi) : foo;
+                        for (int icomp = 0; icomp < ncomp; ++icomp) {
+                            tags.emplace_back(MLMGABCTag<RT>{iofab, bvlo, bvhi,
                                                             maskvals[olo].const_array(mfi),
+                                                             maskvals[ohi].const_array(mfi),
+                                                             bdlv[icomp][olo], bdlv[icomp][ohi],
+                                                             amrex::adjCell(vbx,olo),
+                                                             bdcv[icomp][olo], bdcv[icomp][ohi],
+                                                             vbx.length(idim), icomp, idim});
+                        }
                     }
                 }
             }
+            m_bc_tag_vector.define(tags);
         }
 
-        ParallelFor(tags,
+        ParallelFor(m_bc_tag_vector,
         [=] AMREX_GPU_DEVICE (int i, int j, int k, MLMGABCTag<RT> const& tag) noexcept
         {
             if (tag.dir == 0)
@@ -795,6 +805,11 @@ MLCellLinOpT<MF>::applyBC (int amrlev, int mglev, MF& in, BCMode bc_mode, StateM
 #endif
 #endif
         });
+
+        if (!m_reuse_bc_tag_vector) {
+            m_bc_tag_vector.undefine();
+        }
+
     } else
 #endif
     if (cross || tensorop)
@@ -1211,16 +1226,37 @@ MLCellLinOpT<MF>::apply (int amrlev, int mglev, MF& out, MF& in, BCMode bc_mode,
 template <typename MF>
 void
 MLCellLinOpT<MF>::smooth (int amrlev, int mglev, MF& sol, const MF& rhs,
-                          bool skip_fillboundary) const
+                          bool skip_fillboundary, int niter) const
 {
     BL_PROFILE("MLCellLinOp::smooth()");
-    for (int redblack = 0; redblack < 2; ++redblack)
-    {
-        applyBC(amrlev, mglev, sol, BCMode::Homogeneous, StateMode::Solution,
-                nullptr, skip_fillboundary);
-        Fsmooth(amrlev, mglev, sol, rhs, redblack);
-        skip_fillboundary = false;
+
+    m_reuse_bc_tag_vector = true;
+
+    int ghost_cells_valid = skip_fillboundary ? 1 : 0;
+
+    for (int i = 0; i < niter; ++i) {
+        for (int redblack = 0; redblack < 2; ++redblack)
+        {
+            if (ghost_cells_valid == 0) {
+                const int ncomp = this->getNComp();
+                const int cross = false; //isCrossStencil();
+                const int ngrow = sol.nGrowVect().min();
+                sol.FillBoundary(0, ncomp, IntVect(ngrow),
+                                 this->m_geom[amrlev][mglev].periodicity(), cross);
+                ghost_cells_valid = ngrow;
+            }
+
+            applyBC(amrlev, mglev, sol, BCMode::Homogeneous, StateMode::Solution,
+                    nullptr, true);
+            Fsmooth(amrlev, mglev, sol, rhs, redblack);
+
+            --ghost_cells_valid;
+        }
     }
+
+    m_reuse_bc_tag_vector = false;
+    m_bc_tag_vector.undefine();
 }
 
 template <typename MF>
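The rewritten smooth() trades one ghost-cell exchange per half-sweep for a wide fill whose layers are then consumed one per half-sweep. The accounting, reduced to its core (fill_all_ghosts and sweep are hypothetical stand-ins for sol.FillBoundary(...) and Fsmooth(...)):

    int ghost_cells_valid = skip_fillboundary ? 1 : 0;
    for (int it = 0; it < niter; ++it) {
        for (int redblack = 0; redblack < 2; ++redblack) {
            if (ghost_cells_valid == 0) {
                fill_all_ghosts();   // fills ngrow layers at once
                ghost_cells_valid = ngrow;
            }
            sweep(redblack);         // invalidates one ghost layer
            --ghost_cells_valid;
        }
    }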
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.H b/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.H
index 1d6aa224017..2ef9f8d7ca3 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.H
@@ -73,7 +73,7 @@ public:
                 StateMode s_mode, const MLMGBndryT<MF>* bndry=nullptr) const override;
 
     void smooth (int amrlev, int mglev, MF& sol, const MF& rhs,
-                 bool skip_fillboundary=false) const override;
+                 bool skip_fillboundary=false, int niter=1) const override;
 
     void solutionResidual (int amrlev, MF& resid, MF& x, const MF& b,
                            const MF* crse_bcdata=nullptr) override;
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.cpp b/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.cpp
index 3e500351c91..e38540d0176 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLCurlCurl.cpp
@@ -319,7 +319,7 @@ MLCurlCurl::apply (int amrlev, int mglev, MF& out, MF& in, BCMode /*bc_mode*/,
 }
 
 void MLCurlCurl::smooth (int amrlev, int mglev, MF& sol, const MF& rhs,
-                         bool skip_fillboundary) const
+                         bool skip_fillboundary, int niter) const
 {
     AMREX_ASSERT(rhs[0].nGrowVect().allGE(1));
 
@@ -331,16 +331,18 @@ void MLCurlCurl::smooth (int amrlev, int mglev, MF& sol, const MF& rhs,
     int ncolors = 4;
 #endif
 
-    for (int color = 0; color < ncolors; ++color) {
-        if (!skip_fillboundary) {
-            applyBC(amrlev, mglev, sol, CurlCurlStateType::x);
-        }
-        skip_fillboundary = false;
+    for (int i = 0; i < niter; ++i) {
+        for (int color = 0; color < ncolors; ++color) {
+            if (!skip_fillboundary) {
+                applyBC(amrlev, mglev, sol, CurlCurlStateType::x);
+            }
+            skip_fillboundary = false;
 #if (AMREX_SPACEDIM == 1)
-        smooth1D(amrlev, mglev, sol, rhs, color);
+            smooth1D(amrlev, mglev, sol, rhs, color);
 #else
-        smooth4(amrlev, mglev, sol, rhs, color);
+            smooth4(amrlev, mglev, sol, rhs, color);
 #endif
+        }
     }
 }
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp
index 61580e4d368..53549cbec40 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp
@@ -302,8 +302,6 @@ MLEBNodeFDLaplacian::prepareForSolve ()
 
     AMREX_ASSERT(!isBottomSingular());
 
-    Gpu::streamSynchronize();
-
 #if (AMREX_SPACEDIM == 2)
     if (m_rz) {
         if (m_geom[0][0].ProbLo(0) == 0._rt) {
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H
index 08fa588b4a0..ccc017c6c26 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H
@@ -355,9 +355,10 @@ public:
      * \param sol unknowns
      * \param rhs RHS
      * \param skip_fillboundary flag controlling whether ghost cell filling can be skipped.
+     * \param niter number of (multicolor) smoothing iterations
      */
     virtual void smooth (int amrlev, int mglev, MF& sol, const MF& rhs,
-                         bool skip_fillboundary=false) const = 0;
+                         bool skip_fillboundary=false, int niter=1) const = 0;
 
     //! Divide mf by the diagonal component of the operator. Used by bicgstab.
     virtual void normalize (int amrlev, int mglev, MF& mf) const {
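Every operator now receives the iteration count instead of being called in a loop, which is what allows the ghost-cell and tag reuse above. A sketch of the contract a derived operator must honor (MyLinOp is hypothetical; the loop mirrors MLNodeLinOp::smooth below):

    void MyLinOp::smooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& rhs,
                          bool skip_fillboundary, int niter) const
    {
        for (int i = 0; i < niter; ++i) {
            if (!skip_fillboundary) {
                applyBC(amrlev, mglev, sol, BCMode::Homogeneous, StateMode::Correction);
            }
            Fsmooth(amrlev, mglev, sol, rhs);
            skip_fillboundary = false;   // only the first pass may skip the fill
        }
    }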
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLMG.H b/Src/LinearSolvers/MLMG/AMReX_MLMG.H
index 4b666f2c03d..01a1de73f59 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLMG.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLMG.H
@@ -1317,10 +1317,7 @@ MLMGT<MF>::mgVcycle (int amrlev, int mglev_top)
         setVal(cor[amrlev][mglev], RT(0.0));
 
         bool skip_fillboundary = true;
-        for (int i = 0; i < nu1; ++i) {
-            linop.smooth(amrlev, mglev, cor[amrlev][mglev], res[amrlev][mglev], skip_fillboundary);
-            skip_fillboundary = false;
-        }
+        linop.smooth(amrlev, mglev, cor[amrlev][mglev], res[amrlev][mglev], skip_fillboundary, nu1);
 
         // rescor = res - L(cor)
         computeResOfCorrection(amrlev, mglev);
@@ -1364,11 +1361,8 @@ MLMGT<MF>::mgVcycle (int amrlev, int mglev_top)
         }
         setVal(cor[amrlev][mglev_bottom], RT(0.0));
         bool skip_fillboundary = true;
-        for (int i = 0; i < nu1; ++i) {
-            linop.smooth(amrlev, mglev_bottom, cor[amrlev][mglev_bottom],
-                         res[amrlev][mglev_bottom], skip_fillboundary);
-            skip_fillboundary = false;
-        }
+        linop.smooth(amrlev, mglev_bottom, cor[amrlev][mglev_bottom],
+                     res[amrlev][mglev_bottom], skip_fillboundary, nu1);
 
         if (verbose >= 4) {
             computeResOfCorrection(amrlev, mglev_bottom);
@@ -1391,9 +1385,7 @@ MLMGT<MF>::mgVcycle (int amrlev, int mglev_top)
             amrex::Print() << print_ident << "AT LEVEL " << amrlev << " " << mglev
                            << " UP: Norm before smooth " << norm << "\n";
         }
-        for (int i = 0; i < nu2; ++i) {
-            linop.smooth(amrlev, mglev, cor[amrlev][mglev], res[amrlev][mglev]);
-        }
+        linop.smooth(amrlev, mglev, cor[amrlev][mglev], res[amrlev][mglev], false, nu2);
 
         if (cf_strategy == CFStrategy::ghostnodes) { computeResOfCorrection(amrlev, mglev); }
 
@@ -1512,10 +1504,7 @@ MLMGT<MF>::actualBottomSolve ()
 
     if (bottom_solver == BottomSolver::smoother)
     {
         bool skip_fillboundary = true;
-        for (int i = 0; i < nuf; ++i) {
-            linop.smooth(amrlev, mglev, x, b, skip_fillboundary);
-            skip_fillboundary = false;
-        }
+        linop.smooth(amrlev, mglev, x, b, skip_fillboundary, nuf);
     }
     else
     {
@@ -1589,9 +1578,7 @@ MLMGT<MF>::actualBottomSolve ()
                 setVal(cor[amrlev][mglev], RT(0.0));
             }
             const int n = (ret==0) ? nub : nuf;
-            for (int i = 0; i < n; ++i) {
-                linop.smooth(amrlev, mglev, x, b);
-            }
+            linop.smooth(amrlev, mglev, x, b, false, n);
         }
     }
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeABecLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeABecLaplacian.cpp
index 0fda51eac14..7ead526ee15 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeABecLaplacian.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeABecLaplacian.cpp
@@ -109,7 +109,6 @@ MLNodeABecLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFa
         yarr_ma[box_no](i,j,k) = (dmskarr_ma[box_no](i,j,k)) ? Real(0.0)
             : alpha*acoef_ma[box_no](i,j,k)*xarr_ma[box_no](i,j,k) - beta*lap;
     });
-    Gpu::streamSynchronize();
 }
 
 void
@@ -145,7 +144,6 @@ MLNodeABecLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiF
                                acoef_ma[box_no], bcoef_ma[box_no],
                                dmskarr_ma[box_no], dxinvarr);
         });
-        Gpu::streamSynchronize();
         if (m_smooth_num_sweeps > 1) { nodalSync(amrlev, mglev, sol); }
     }
 #else
@@ -193,7 +191,6 @@ MLNodeABecLaplacian::restriction (int amrlev, int cmglev, MultiFab& crse, MultiF
     {
         mlndlap_restriction(i,j,k,pcrse_ma[box_no],fine_ma[box_no],msk_ma[box_no]);
     });
-    Gpu::streamSynchronize();
 
     if (need_parallel_copy) {
         crse.ParallelCopy(cfine);
@@ -225,7 +222,6 @@ MLNodeABecLaplacian::interpolation (int amrlev, int fmglev, MultiFab& fine, cons
         mlndlap_interpadd_aa(i, j, k, fine_ma[box_no], crse_ma[box_no],
                              sig_ma[box_no], msk_ma[box_no]);
     });
-    Gpu::streamSynchronize();
 }
 
 void
@@ -280,7 +276,6 @@ MLNodeABecLaplacian::fixUpResidualMask (int amrlev, iMultiFab& resmsk)
     {
         if (fmsk[bno](i,j,k) == nodelap_detail::crse_fine_node) { rmsk[bno](i,j,k) = 1; }
     });
-    Gpu::streamSynchronize();
 }
 
 void
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp
index 8e490f30348..d6632e0ee0e 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp
@@ -265,7 +265,6 @@ MLNodeLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& i
 #endif
             });
         }
-        Gpu::streamSynchronize();
     } else
 #endif
     {
@@ -558,7 +557,6 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab&
             }
         }
 
-        Gpu::streamSynchronize();
         nodalSync(amrlev, mglev, sol);
     }
     else
@@ -700,8 +698,6 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab&
                 }
             }
         }
-
-        Gpu::streamSynchronize();
     }
 }
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H
index a9c1a65b525..8833974de03 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H
@@ -45,7 +45,7 @@ public:
                 StateMode s_mode, const MLMGBndry* bndry=nullptr) const final;
 
     void smooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& rhs,
-                 bool skip_fillboundary=false) const override;
+                 bool skip_fillboundary=false, int niter=1) const override;
 
     void solutionResidual (int amrlev, MultiFab& resid, MultiFab& x, const MultiFab& b,
                            const MultiFab* crse_bcdata=nullptr) override;
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp
index 9719ac3d3e8..a8d2453fd71 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp
@@ -167,12 +167,15 @@ MLNodeLinOp::apply (int amrlev, int mglev, MultiFab& out, MultiFab& in, BCMode b
 
 void
 MLNodeLinOp::smooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& rhs,
-                     bool skip_fillboundary) const
+                     bool skip_fillboundary, int niter) const
 {
-    if (!skip_fillboundary) {
-        applyBC(amrlev, mglev, sol, BCMode::Homogeneous, StateMode::Correction);
+    for (int i = 0; i < niter; ++i) {
+        if (!skip_fillboundary) {
+            applyBC(amrlev, mglev, sol, BCMode::Homogeneous, StateMode::Correction);
+        }
+        Fsmooth(amrlev, mglev, sol, rhs);
+        skip_fillboundary = false;
     }
-    Fsmooth(amrlev, mglev, sol, rhs);
 }
 
 Real
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.H
index 03a73cae86d..2f9b34a030b 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.H
@@ -58,7 +58,7 @@ public:
                                MultiFab& fine_res, MultiFab& fine_sol, const MultiFab& fine_rhs) const final;
 
     void smooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& rhs,
-                 bool skip_fillboundary=false) const final;
+                 bool skip_fillboundary=false, int niter=1) const final;
 
     void prepareForSolve () final;
     void Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& in) const final;
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp
index e8135aeba14..1833c8e566a 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp
@@ -219,24 +219,25 @@ MLNodeTensorLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const Multi
     {
         mlndtslap_adotx(i,j,k, out_a[box_no], in_a[box_no], dmsk_a[box_no], s);
     });
-    Gpu::streamSynchronize();
 #endif
 }
 
 void
 MLNodeTensorLaplacian::smooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& rhs,
-                               bool skip_fillboundary) const
+                               bool skip_fillboundary, int niter) const
 {
     BL_PROFILE("MLNodeTensorLaplacian::smooth()");
-    for (int redblack = 0; redblack < 4; ++redblack) {
-        if (!skip_fillboundary) {
-            applyBC(amrlev, mglev, sol, BCMode::Homogeneous, StateMode::Correction);
+    for (int i = 0; i < niter; ++i) {
+        for (int redblack = 0; redblack < 4; ++redblack) {
+            if (!skip_fillboundary) {
+                applyBC(amrlev, mglev, sol, BCMode::Homogeneous, StateMode::Correction);
+            }
+            m_redblack = redblack;
+            Fsmooth(amrlev, mglev, sol, rhs);
+            skip_fillboundary = false;
         }
-        m_redblack = redblack;
-        Fsmooth(amrlev, mglev, sol, rhs);
-        skip_fillboundary = false;
+        nodalSync(amrlev, mglev, sol);
     }
-    nodalSync(amrlev, mglev, sol);
 }
 
 void
@@ -261,7 +262,6 @@ MLNodeTensorLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const Mult
             mlndtslap_gauss_seidel(i, j, k, sol_a[box_no], rhs_a[box_no],
                                    dmsk_a[box_no], s);
         }
     });
-    Gpu::streamSynchronize();
 #endif
 }
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H
index 679a1c9e583..7e693ad9b64 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H
@@ -235,7 +235,6 @@ MLPoissonT<MF>::Fapply (int amrlev, int mglev, MF& out, const MF& in) const
             });
         }
     }
-    Gpu::streamSynchronize();
 } else
 #endif
 {
@@ -333,7 +332,6 @@ MLPoissonT<MF>::normalize (int amrlev, int mglev, MF& mf) const
     {
         mlpoisson_normalize(i,j,k, ma[box_no], AMREX_D_DECL(dhx,dhy,dhz), dx, probxlo);
     });
-    Gpu::streamSynchronize();
 } else
 #endif
 {