diff --git a/Examples/Tests/open_bc_poisson_solver/CMakeLists.txt b/Examples/Tests/open_bc_poisson_solver/CMakeLists.txt
index c5ec4583da1..d6141f0b4ab 100644
--- a/Examples/Tests/open_bc_poisson_solver/CMakeLists.txt
+++ b/Examples/Tests/open_bc_poisson_solver/CMakeLists.txt
@@ -12,3 +12,15 @@ if(WarpX_FFT)
         OFF  # dependency
     )
 endif()
+
+if(WarpX_HEFFTE)
+    add_warpx_test(
+        test_3d_open_bc_poisson_solver_heffte # name
+        3  # dims
+        2  # nprocs
+        inputs_test_3d_open_bc_poisson_solver_heffte  # inputs
+        analysis.py  # analysis
+        diags/diag1000001  # output
+        OFF  # dependency
+    )
+endif()
diff --git a/Examples/Tests/open_bc_poisson_solver/inputs_test_3d_open_bc_poisson_solver_heffte b/Examples/Tests/open_bc_poisson_solver/inputs_test_3d_open_bc_poisson_solver_heffte
new file mode 100644
index 00000000000..4f0a50df037
--- /dev/null
+++ b/Examples/Tests/open_bc_poisson_solver/inputs_test_3d_open_bc_poisson_solver_heffte
@@ -0,0 +1 @@
+FILE = inputs_test_3d_open_bc_poisson_solver
diff --git a/GNUmakefile b/GNUmakefile
index 86bdab2709f..fe10983b780 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -38,6 +38,7 @@ USE_OPENPMD = FALSE
 WarpxBinDir = Bin
 
 USE_FFT = FALSE
+USE_HEFFTE = FALSE
 USE_RZ = FALSE
 
 USE_EB = FALSE
diff --git a/Regression/Checksum/benchmarks_json/test_3d_open_bc_poisson_solver.json b/Regression/Checksum/benchmarks_json/test_3d_open_bc_poisson_solver.json
index 0453481ec60..0ca6bde570a 100644
--- a/Regression/Checksum/benchmarks_json/test_3d_open_bc_poisson_solver.json
+++ b/Regression/Checksum/benchmarks_json/test_3d_open_bc_poisson_solver.json
@@ -1,20 +1,20 @@
 {
   "lev=0": {
-    "Bx": 100915975.15792876,
-    "By": 157610677.31483692,
-    "Bz": 2.404060922276648e-13,
-    "Ex": 4.725066923361703e+16,
-    "Ey": 3.0253961494347724e+16,
-    "Ez": 3276584.4383433666,
+    "Bx": 100915975.15403552,
+    "By": 157610677.3147734,
+    "Bz": 1.2276713711194638e-13,
+    "Ex": 4.725066923359797e+16,
+    "Ey": 3.025396149317578e+16,
+    "Ez": 3276584.4383433824,
     "rho": 10994013582437.197
   },
   "electron": {
-    "particle_momentum_x": 5.701279599504008e-19,
-    "particle_momentum_y": 3.650453172860547e-19,
+    "particle_momentum_x": 5.701279599509506e-19,
+    "particle_momentum_y": 3.650453172383178e-19,
     "particle_momentum_z": 1.145432768297242e-10,
     "particle_position_x": 17.31408691249785,
-    "particle_position_y": 0.2583691267187801,
+    "particle_position_y": 0.25836912671878015,
     "particle_position_z": 10066.329600000008,
     "particle_weight": 19969036501.910976
   }
-}
+}
\ No newline at end of file
diff --git a/Source/ablastr/fields/IntegratedGreenFunctionSolver.H b/Source/ablastr/fields/IntegratedGreenFunctionSolver.H
index 97ffdb5ac36..28885e167a3 100644
--- a/Source/ablastr/fields/IntegratedGreenFunctionSolver.H
+++ b/Source/ablastr/fields/IntegratedGreenFunctionSolver.H
@@ -7,6 +7,8 @@
 #ifndef ABLASTR_IGF_SOLVER_H
 #define ABLASTR_IGF_SOLVER_H
 
+#include <ablastr/constant.H>
+
 #include <AMReX_BoxArray.H>
 #include <AMReX_GpuQualifiers.H>
 #include <AMReX_MultiFab.H>
@@ -47,6 +49,35 @@ namespace ablastr::fields
         return G;
     }
 
+    /** @brief add
+     *
+     * @param[in] x x-coordinate of given location
+     * @param[in] y y-coordinate of given location
+     * @param[in] z z-coordinate of given location
+     *
+     * @return the sum of integrated Green function G
+     */
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    amrex::Real
+    SumOfIntegratedPotential (amrex::Real x, amrex::Real y, amrex::Real z, amrex::Real dx, amrex::Real dy, amrex::Real dz)
+    {
+        using namespace amrex::literals;
+
+
+        amrex::Real const G_value = 1._rt/(4._rt*ablastr::constant::math::pi*ablastr::constant::SI::ep0) * (
+            IntegratedPotential( x+0.5_rt*dx, y+0.5_rt*dy, z+0.5_rt*dz )
+          - IntegratedPotential( x-0.5_rt*dx, y+0.5_rt*dy, z+0.5_rt*dz )
+          - IntegratedPotential( x+0.5_rt*dx, y-0.5_rt*dy, z+0.5_rt*dz )
+          + IntegratedPotential( x-0.5_rt*dx, y-0.5_rt*dy, z+0.5_rt*dz )
+          - IntegratedPotential( x+0.5_rt*dx, y+0.5_rt*dy, z-0.5_rt*dz )
+          + IntegratedPotential( x-0.5_rt*dx, y+0.5_rt*dy, z-0.5_rt*dz )
+          + IntegratedPotential( x+0.5_rt*dx, y-0.5_rt*dy, z-0.5_rt*dz )
+          - IntegratedPotential( x-0.5_rt*dx, y-0.5_rt*dy, z-0.5_rt*dz )
+        );
+
+        return G_value;
+    }
+
     /** @brief Compute the electrostatic potential using the Integrated Green Function method
      *         as in http://dx.doi.org/10.1103/PhysRevSTAB.9.044204
      *
diff --git a/Source/ablastr/fields/IntegratedGreenFunctionSolver.cpp b/Source/ablastr/fields/IntegratedGreenFunctionSolver.cpp
index 1aeee9d81d2..5b9aa940d6a 100644
--- a/Source/ablastr/fields/IntegratedGreenFunctionSolver.cpp
+++ b/Source/ablastr/fields/IntegratedGreenFunctionSolver.cpp
@@ -27,7 +27,9 @@
 #include <AMReX_MultiFab.H>
 #include <AMReX_REAL.H>
 
-#include <array>
+#if defined(ABLASTR_USE_FFT) && defined(ABLASTR_USE_HEFFTE)
+#include <heffte.h>
+#endif
 
 
 namespace ablastr::fields {
@@ -36,10 +38,16 @@ void
 computePhiIGF ( amrex::MultiFab const & rho,
                 amrex::MultiFab & phi,
                 std::array<amrex::Real, 3> const & cell_size,
-                amrex::BoxArray const & ba )
+                amrex::BoxArray const & ba)
 {
     using namespace amrex::literals;
 
+    BL_PROFILE_VAR_NS("ablastr::fields::computePhiIGF: FFTs", timer_ffts);
+    BL_PROFILE_VAR_NS("ablastr::fields::computePhiIGF: FFT plans", timer_plans);
+    BL_PROFILE_VAR_NS("ablastr::fields::computePhiIGF: parallel copies", timer_pcopies);
+
+    BL_PROFILE("ablastr::fields::computePhiIGF");
+
     // Define box that encompasses the full domain
     amrex::Box domain = ba.minimalBox();
     domain.surroundingNodes(); // get nodal points, since `phi` and `rho` are nodal
@@ -50,41 +58,87 @@ computePhiIGF ( amrex::MultiFab const & rho,
     int const nz = domain.length(2);
 
     // Allocate 2x wider arrays for the convolution of rho with the Green function
-    // This also defines the box arrays for the global FFT: contains only one box;
     amrex::Box const realspace_box = amrex::Box(
         {domain.smallEnd(0), domain.smallEnd(1), domain.smallEnd(2)},
         {2*nx-1+domain.smallEnd(0), 2*ny-1+domain.smallEnd(1), 2*nz-1+domain.smallEnd(2)},
         amrex::IntVect::TheNodeVector() );
+
+#if !defined(ABLASTR_USE_HEFFTE)
+    // Without distributed FFTs (i.e. without heFFTe):
+    // allocate the 2x wider array on a single box
     amrex::BoxArray const realspace_ba = amrex::BoxArray( realspace_box );
-    amrex::Box const spectralspace_box = amrex::Box(
-        {0,0,0},
-        {nx, 2*ny-1, 2*nz-1},
-        amrex::IntVect::TheNodeVector() );
-    amrex::BoxArray const spectralspace_ba = amrex::BoxArray( spectralspace_box );
     // Define a distribution mapping for the global FFT, with only one box
     amrex::DistributionMapping dm_global_fft;
     dm_global_fft.define( realspace_ba );
+#elif defined(ABLASTR_USE_HEFFTE)
+    // With distributed FFTs (i.e. with heFFTe):
+    // Define a new distribution mapping which is decomposed purely along z
+    // and has one box per MPI rank
+    int const nprocs = amrex::ParallelDescriptor::NProcs();
+    amrex::BoxArray realspace_ba;
+    amrex::DistributionMapping dm_global_fft;
+    {
+        int realspace_nx = realspace_box.length(0);
+        int realspace_ny = realspace_box.length(1);
+        int realspace_nz = realspace_box.length(2);
+        int minsize_z = realspace_nz / nprocs;
+        int nleft_z = realspace_nz - minsize_z*nprocs;
+
+        AMREX_ALWAYS_ASSERT(realspace_nz >= nprocs);
+        // We are going to split realspace_box in such a way that the first
+        // nleft boxes has minsize_z+1 nodes and the others minsize
+        // nodes. We do it this way instead of BoxArray::maxSize to make
+        // sure there are exactly nprocs boxes and there are no overlaps.
+        amrex::BoxList bl(amrex::IndexType::TheNodeType());
+        for (int iproc = 0; iproc < nprocs; ++iproc) {
+            int zlo, zhi;
+            if (iproc < nleft_z) {
+                zlo = iproc*(minsize_z+1);
+                zhi = zlo + minsize_z;
+
+            } else {
+                zlo = iproc*minsize_z + nleft_z;
+                zhi = zlo + minsize_z - 1;
+
+            }
+            amrex::Box tbx(amrex::IntVect(0,0,zlo),amrex::IntVect(realspace_nx-1,realspace_ny-1,zhi),amrex::IntVect(1));
+
+            tbx.shift(realspace_box.smallEnd());
+            bl.push_back(tbx);
+        }
+        realspace_ba.define(std::move(bl));
+        amrex::Vector<int> pmap(nprocs);
+        std::iota(pmap.begin(), pmap.end(), 0);
+        dm_global_fft.define(std::move(pmap));
+    }
+#endif
+
     // Allocate required arrays
     amrex::MultiFab tmp_rho = amrex::MultiFab(realspace_ba, dm_global_fft, 1, 0);
     tmp_rho.setVal(0);
     amrex::MultiFab tmp_G = amrex::MultiFab(realspace_ba, dm_global_fft, 1, 0);
     tmp_G.setVal(0);
-    // Allocate corresponding arrays in Fourier space
-    using SpectralField = amrex::FabArray< amrex::BaseFab< amrex::GpuComplex< amrex::Real > > >;
-    SpectralField tmp_rho_fft = SpectralField( spectralspace_ba, dm_global_fft, 1, 0 );
-    SpectralField tmp_G_fft = SpectralField( spectralspace_ba, dm_global_fft, 1, 0 );
 
-    // Copy from rho to tmp_rho
+    BL_PROFILE_VAR_START(timer_pcopies);
+    // Copy from rho including its ghost cells to tmp_rho
     tmp_rho.ParallelCopy( rho, 0, 0, 1, rho.nGrowVect(), amrex::IntVect::TheZeroVector() );
+    BL_PROFILE_VAR_STOP(timer_pcopies);
+
+#if !defined(ABLASTR_USE_HEFFTE)
+    // Without distributed FFTs (i.e. without heFFTe):
+    // We loop over the original box (not the 2x wider one), and the other quadrants by periodicity
+    amrex::BoxArray const& igf_compute_box = amrex::BoxArray( domain );
+#else
+    // With distributed FFTs (i.e. with heFFTe):
+    // We loop over the full 2x wider box, since 1 MPI rank does not necessarily own the data for the other quadrants
+    amrex::BoxArray const& igf_compute_box = tmp_G.boxArray();
+#endif
 
     // Compute the integrated Green function
-    {
-    BL_PROFILE("Initialize Green function");
-    amrex::BoxArray const domain_ba = amrex::BoxArray( domain );
 #ifdef AMREX_USE_OMP
 #pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
 #endif
-    for (amrex::MFIter mfi(domain_ba, dm_global_fft,amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
+    for (amrex::MFIter mfi(igf_compute_box, dm_global_fft, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
 
         amrex::Box const bx = mfi.tilebox();
 
@@ -95,6 +149,7 @@ computePhiIGF ( amrex::MultiFab const & rho,
         amrex::Real const dx = cell_size[0];
         amrex::Real const dy = cell_size[1];
         amrex::Real const dz = cell_size[2];
+
         amrex::Array4<amrex::Real> const tmp_G_arr = tmp_G.array(mfi);
         amrex::ParallelFor( bx,
             [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
@@ -106,17 +161,9 @@ computePhiIGF ( amrex::MultiFab const & rho,
                 amrex::Real const y = j0*dy;
                 amrex::Real const z = k0*dz;
 
-                amrex::Real const G_value = 1._rt/(4._rt*ablastr::constant::math::pi*ablastr::constant::SI::ep0) * (
-                    IntegratedPotential( x+0.5_rt*dx, y+0.5_rt*dy, z+0.5_rt*dz )
-                  - IntegratedPotential( x-0.5_rt*dx, y+0.5_rt*dy, z+0.5_rt*dz )
-                  - IntegratedPotential( x+0.5_rt*dx, y-0.5_rt*dy, z+0.5_rt*dz )
-                  - IntegratedPotential( x+0.5_rt*dx, y+0.5_rt*dy, z-0.5_rt*dz )
-                  + IntegratedPotential( x+0.5_rt*dx, y-0.5_rt*dy, z-0.5_rt*dz )
-                  + IntegratedPotential( x-0.5_rt*dx, y+0.5_rt*dy, z-0.5_rt*dz )
-                  + IntegratedPotential( x-0.5_rt*dx, y-0.5_rt*dy, z+0.5_rt*dz )
-                  - IntegratedPotential( x-0.5_rt*dx, y-0.5_rt*dy, z-0.5_rt*dz )
-                );
-
+#if !defined(ABLASTR_USE_HEFFTE)
+                // Without distributed FFTs (i.e. without heFFTe):
+                amrex::Real const G_value = SumOfIntegratedPotential(x     , y     , z     , dx, dy, dz);
                 tmp_G_arr(i,j,k) = G_value;
                 // Fill the rest of the array by periodicity
                 if (i0>0) {tmp_G_arr(hi[0]+1-i0, j         , k         ) = G_value;}
@@ -126,71 +173,120 @@ computePhiIGF ( amrex::MultiFab const & rho,
                 if ((j0>0)&&(k0>0)) {tmp_G_arr(i         , hi[1]+1-j0, hi[2]+1-k0) = G_value;}
                 if ((i0>0)&&(k0>0)) {tmp_G_arr(hi[0]+1-i0, j         , hi[2]+1-k0) = G_value;}
                 if ((i0>0)&&(j0>0)&&(k0>0)) {tmp_G_arr(hi[0]+1-i0, hi[1]+1-j0, hi[2]+1-k0) = G_value;}
-            }
-        );
-    }
+#else
+                // With distributed FFTs (i.e. with heFFTe):
+                amrex::Real x_hi = dx*(hi[0]+2);
+                amrex::Real y_hi = dy*(hi[1]+2);
+                amrex::Real z_hi = dz*(hi[2]+2);
+                if ((i0< nx)&&(j0< ny)&&(k0< nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x     , y     , z     , dx, dy, dz); }
+                if ((i0< nx)&&(j0> ny)&&(k0< nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x     , y_hi-y, z     , dx, dy, dz); }
+                if ((i0< nx)&&(j0< ny)&&(k0> nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x     , y     , z_hi-z, dx, dy, dz); }
+                if ((i0> nx)&&(j0> ny)&&(k0< nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x_hi-x, y_hi-y, z     , dx, dy, dz); }
+                if ((i0< nx)&&(j0> ny)&&(k0> nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x     , y_hi-y, z_hi-z, dx, dy, dz); }
+                if ((i0> nx)&&(j0< ny)&&(k0> nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x_hi-x, y     , z_hi-z, dx, dy, dz); }
+                if ((i0> nx)&&(j0> ny)&&(k0> nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x_hi-x, y_hi-y, z_hi-z, dx, dy, dz); }
+                if ((i0> nx)&&(j0< ny)&&(k0< nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x_hi-x, y     , z     , dx, dy, dz); }
+#endif
+         }
+      );
     }
 
-    // Perform forward FFTs
-    auto forward_plan_rho = ablastr::math::anyfft::FFTplans(spectralspace_ba, dm_global_fft);
-    auto forward_plan_G = ablastr::math::anyfft::FFTplans(spectralspace_ba, dm_global_fft);
-    // Loop over boxes perform FFTs
-    for ( amrex::MFIter mfi(realspace_ba, dm_global_fft); mfi.isValid(); ++mfi ){
-
-        // Note: the size of the real-space box and spectral-space box
-        // differ when using real-to-complex FFT. When initializing
-        // the FFT plan, the valid dimensions are those of the real-space box.
-        const amrex::IntVect fft_size = realspace_ba[mfi].length();
-
-        // FFT of rho
-        forward_plan_rho[mfi] = ablastr::math::anyfft::CreatePlan(
-            fft_size, tmp_rho[mfi].dataPtr(),
-            reinterpret_cast<ablastr::math::anyfft::Complex*>(tmp_rho_fft[mfi].dataPtr()),
-            ablastr::math::anyfft::direction::R2C, AMREX_SPACEDIM);
-        ablastr::math::anyfft::Execute(forward_plan_rho[mfi]);
+    // Prepare to perform global FFT
+    // Since there is 1 MPI rank per box, here each MPI rank obtains its local box and the associated boxid
+    int local_boxid = amrex::ParallelDescriptor::MyProc(); // because of how we made the DistributionMapping
+    if (local_boxid < realspace_ba.size()) {
+        // When not using heFFTe, there is only one box (the global box)
+        // It is taken care of my MPI rank 0 ; other ranks have no work (hence the if condition)
 
-        // FFT of G
-        forward_plan_G[mfi] = ablastr::math::anyfft::CreatePlan(
-            fft_size, tmp_G[mfi].dataPtr(),
-            reinterpret_cast<ablastr::math::anyfft::Complex*>(tmp_G_fft[mfi].dataPtr()),
-            ablastr::math::anyfft::direction::R2C, AMREX_SPACEDIM);
-        ablastr::math::anyfft::Execute(forward_plan_G[mfi]);
+        amrex::Box local_nodal_box = realspace_ba[local_boxid];
+        amrex::Box local_box(local_nodal_box.smallEnd(), local_nodal_box.bigEnd());
+        local_box.shift(-realspace_box.smallEnd()); // This simplifies the setup because the global lo is zero now
+        // Since we the domain decompostion is in the z-direction, setting up c_local_box is simple.
+        amrex::Box c_local_box = local_box;
+        c_local_box.setBig(0, local_box.length(0)/2+1);
 
-    }
+        // Allocate array in spectral space
+        using SpectralField = amrex::BaseFab< amrex::GpuComplex< amrex::Real > > ;
+        SpectralField tmp_rho_fft(c_local_box, 1, amrex::The_Device_Arena());
+        SpectralField tmp_G_fft(c_local_box, 1, amrex::The_Device_Arena());
+        tmp_rho_fft.shift(realspace_box.smallEnd());
+        tmp_G_fft.shift(realspace_box.smallEnd());
 
-    // Multiply tmp_G_fft and tmp_rho_fft in spectral space
-    // Store the result in-place in Gtmp_G_fft, to save memory
-    amrex::Multiply( tmp_G_fft, tmp_rho_fft, 0, 0, 1, 0);
+        // Create FFT plans
+        BL_PROFILE_VAR_START(timer_plans);
+#if !defined(ABLASTR_USE_HEFFTE)
+        const amrex::IntVect fft_size = realspace_ba[local_boxid].length();
+        ablastr::math::anyfft::FFTplan forward_plan_rho = ablastr::math::anyfft::CreatePlan(
+            fft_size, tmp_rho[local_boxid].dataPtr(),
+            reinterpret_cast<ablastr::math::anyfft::Complex*>(tmp_rho_fft.dataPtr()),
+            ablastr::math::anyfft::direction::R2C, AMREX_SPACEDIM);
+        ablastr::math::anyfft::FFTplan forward_plan_G = ablastr::math::anyfft::CreatePlan(
+            fft_size, tmp_G[local_boxid].dataPtr(),
+            reinterpret_cast<ablastr::math::anyfft::Complex*>(tmp_G_fft.dataPtr()),
+            ablastr::math::anyfft::direction::R2C, AMREX_SPACEDIM);
+        ablastr::math::anyfft::FFTplan backward_plan = ablastr::math::anyfft::CreatePlan(
+            fft_size, tmp_G[local_boxid].dataPtr(),
+            reinterpret_cast<ablastr::math::anyfft::Complex*>( tmp_G_fft.dataPtr()),
+            ablastr::math::anyfft::direction::C2R, AMREX_SPACEDIM);
+#elif defined(ABLASTR_USE_HEFFTE)
+#if     defined(AMREX_USE_CUDA)
+        heffte::fft3d_r2c<heffte::backend::cufft> fft
+#elif   defined(AMREX_USE_HIP)
+        heffte::fft3d_r2c<heffte::backend::rocfft> fft
+#else
+        heffte::fft3d_r2c<heffte::backend::fftw> fft
+#endif
+            ({{local_box.smallEnd(0), local_box.smallEnd(1), local_box.smallEnd(2)},
+            {local_box.bigEnd(0), local_box.bigEnd(1), local_box.bigEnd(2)}},
+            {{c_local_box.smallEnd(0), c_local_box.smallEnd(1), c_local_box.smallEnd(2)},
+            {c_local_box.bigEnd(0), c_local_box.bigEnd(1), c_local_box.bigEnd(2)}},
+            0, amrex::ParallelDescriptor::Communicator());
+        using heffte_complex = typename heffte::fft_output<amrex::Real>::type;
+        heffte_complex* rho_fft_data = (heffte_complex*) tmp_rho_fft.dataPtr();
+        heffte_complex* G_fft_data = (heffte_complex*) tmp_G_fft.dataPtr();
+#endif
+        BL_PROFILE_VAR_STOP(timer_plans);
 
-    // Perform inverse FFT
-    auto backward_plan = ablastr::math::anyfft::FFTplans(spectralspace_ba, dm_global_fft);
-    // Loop over boxes perform FFTs
-    for ( amrex::MFIter mfi(spectralspace_ba, dm_global_fft); mfi.isValid(); ++mfi ){
+        // Perform forward FFTs
+        BL_PROFILE_VAR_START(timer_ffts);
+#if !defined(ABLASTR_USE_HEFFTE)
+        ablastr::math::anyfft::Execute(forward_plan_rho);
+        ablastr::math::anyfft::Execute(forward_plan_G);
+#elif defined(ABLASTR_USE_HEFFTE)
+        fft.forward(tmp_rho[local_boxid].dataPtr(), rho_fft_data);
+        fft.forward(tmp_G[local_boxid].dataPtr(), G_fft_data);
+#endif
+        BL_PROFILE_VAR_STOP(timer_ffts);
 
-        // Note: the size of the real-space box and spectral-space box
-        // differ when using real-to-complex FFT. When initializing
-        // the FFT plan, the valid dimensions are those of the real-space box.
-        const amrex::IntVect fft_size = realspace_ba[mfi].length();
+        // Multiply tmp_G_fft and tmp_rho_fft in spectral space
+        // Store the result in-place in Gtmp_G_fft, to save memory
+        tmp_G_fft.template mult<amrex::RunOn::Device>(tmp_rho_fft, 0, 0, 1);
+        amrex::Gpu::streamSynchronize();
 
-        // Inverse FFT: is done in-place, in the array of G
-        backward_plan[mfi] = ablastr::math::anyfft::CreatePlan(
-            fft_size, tmp_G[mfi].dataPtr(),
-            reinterpret_cast<ablastr::math::anyfft::Complex*>( tmp_G_fft[mfi].dataPtr()),
-            ablastr::math::anyfft::direction::C2R, AMREX_SPACEDIM);
-        ablastr::math::anyfft::Execute(backward_plan[mfi]);
+        // Perform backward FFT
+        BL_PROFILE_VAR_START(timer_ffts);
+#if !defined(ABLASTR_USE_HEFFTE)
+        ablastr::math::anyfft::Execute(backward_plan);
+#elif defined(ABLASTR_USE_HEFFTE)
+        fft.backward(G_fft_data, tmp_G[local_boxid].dataPtr());
+#endif
+        BL_PROFILE_VAR_STOP(timer_ffts);
+
+#if !defined(ABLASTR_USE_HEFFTE)
+        // Loop to destroy FFT plans
+        ablastr::math::anyfft::DestroyPlan(forward_plan_G);
+        ablastr::math::anyfft::DestroyPlan(forward_plan_rho);
+        ablastr::math::anyfft::DestroyPlan(backward_plan);
+#endif
     }
-    // Normalize, since (FFT + inverse FFT) results in a factor N
+
+     // Normalize, since (FFT + inverse FFT) results in a factor N
     const amrex::Real normalization = 1._rt / realspace_box.numPts();
     tmp_G.mult( normalization );
 
+    BL_PROFILE_VAR_START(timer_pcopies);
     // Copy from tmp_G to phi
-    phi.ParallelCopy( tmp_G, 0, 0, 1, amrex::IntVect::TheZeroVector(), phi.nGrowVect() );
-
-    // Loop to destroy FFT plans
-    for ( amrex::MFIter mfi(spectralspace_ba, dm_global_fft); mfi.isValid(); ++mfi ){
-        ablastr::math::anyfft::DestroyPlan(forward_plan_G[mfi]);
-        ablastr::math::anyfft::DestroyPlan(forward_plan_rho[mfi]);
-        ablastr::math::anyfft::DestroyPlan(backward_plan[mfi]);
-    }
+    phi.ParallelCopy( tmp_G, 0, 0, 1, amrex::IntVect::TheZeroVector(), phi.nGrowVect());
+    BL_PROFILE_VAR_STOP(timer_pcopies);
 }
 } // namespace ablastr::fields