diff --git a/src/core/electrostatics/coulomb.cpp b/src/core/electrostatics/coulomb.cpp
index 14282a8b17..cd5dfa8766 100644
--- a/src/core/electrostatics/coulomb.cpp
+++ b/src/core/electrostatics/coulomb.cpp
@@ -145,7 +145,7 @@ Solver::calc_pressure_long_range(ParticleRange const &particles) const {
 struct ShortRangeCutoff {
 #ifdef P3M
   auto operator()(std::shared_ptr<CoulombP3M> const &actor) const {
-    return actor->p3m.params.r_cut;
+    return actor->p3m_params.r_cut;
   }
   auto
   operator()(std::shared_ptr<ElectrostaticLayerCorrection> const &actor) const {
diff --git a/src/core/electrostatics/elc.cpp b/src/core/electrostatics/elc.cpp
index 28f4f5717b..a7988e84db 100644
--- a/src/core/electrostatics/elc.cpp
+++ b/src/core/electrostatics/elc.cpp
@@ -1011,7 +1011,8 @@ void ElectrostaticLayerCorrection::adapt_solver() {
   std::visit(
       [this](auto &solver) {
         set_prefactor(solver->prefactor);
-        solver->p3m.params.epsilon = P3M_EPSILON_METALLIC;
+        solver->adapt_epsilon_elc();
+        assert(solver->p3m_params.epsilon == P3M_EPSILON_METALLIC);
       },
       base_solver);
 }
@@ -1031,7 +1032,7 @@ void ElectrostaticLayerCorrection::recalc_box_h() {
 void ElectrostaticLayerCorrection::recalc_space_layer() {
   if (elc.dielectric_contrast_on) {
     auto const p3m_r_cut = std::visit(
-        [](auto &solver) { return solver->p3m.params.r_cut; }, base_solver);
+        [](auto &solver) { return solver->p3m_params.r_cut; }, base_solver);
     // recalculate the space layer size:
     // 1. set the space_layer to be 1/3 of the gap size, so that box = layer
     elc.space_layer = (1. / 3.) * elc.gap_size;
diff --git a/src/core/electrostatics/p3m.cpp b/src/core/electrostatics/p3m.cpp
index 2475b4954c..08cef7bd74 100644
--- a/src/core/electrostatics/p3m.cpp
+++ b/src/core/electrostatics/p3m.cpp
@@ -90,6 +90,10 @@
 #include <tuple>
 #include <utility>
 
+#ifdef FFTW3_H
+#error "The FFTW3 library shouldn't be visible in this translation unit"
+#endif
+
 template <typename FloatType, Arch Architecture>
 void CoulombP3MImpl<FloatType, Architecture>::count_charged_particles() {
   auto local_n = 0;
@@ -262,8 +266,11 @@ void CoulombP3MImpl<FloatType, Architecture>::init_cpu_kernels() {
 
   assert(p3m.fft);
   p3m.local_mesh.calc_local_ca_mesh(p3m.params, local_geo, skin, elc_layer);
-  p3m.fft->init_halo();
-  p3m.fft->init_fft();
+  p3m.fft_buffers->init_halo();
+  p3m.fft->init(p3m.params);
+  p3m.mesh.ks_pnum = p3m.fft->get_ks_pnum();
+  p3m.fft_buffers->init_meshes(p3m.fft->get_ca_mesh_size());
+  p3m.update_mesh_views();
   p3m.calc_differential_operator();
 
   /* fix box length dependent constants */
@@ -391,8 +398,9 @@ Utils::Vector9d CoulombP3MImpl<FloatType, Architecture>::long_range_pressure(
 
   if (p3m.sum_q2 > 0.) {
     charge_assign(particles);
-    p3m.fft->perform_scalar_halo_gather();
-    p3m.fft->perform_scalar_fwd_fft();
+    p3m.fft_buffers->perform_scalar_halo_gather();
+    p3m.fft->forward_fft(p3m.fft_buffers->get_scalar_mesh());
+    p3m.update_mesh_views();
 
     auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
     auto const &mesh_stop = p3m.mesh.size;
@@ -457,8 +465,9 @@ double CoulombP3MImpl<FloatType, Architecture>::long_range_kernel(
             system.coulomb.impl->solver)) {
       charge_assign(particles);
     }
-    p3m.fft->perform_scalar_halo_gather();
-    p3m.fft->perform_scalar_fwd_fft();
+    p3m.fft_buffers->perform_scalar_halo_gather();
+    p3m.fft->forward_fft(p3m.fft_buffers->get_scalar_mesh());
+    p3m.update_mesh_views();
   }
 
   auto p_q_range = ParticlePropertyRange::charge_range(particles);
@@ -515,8 +524,10 @@ double CoulombP3MImpl<FloatType, Architecture>::long_range_kernel(
     auto const check_residuals =
         not p3m.params.tuning and check_complex_residuals;
     p3m.fft->check_complex_residuals = check_residuals;
-    p3m.fft->perform_vector_back_fft();
-    p3m.fft->perform_vector_halo_spread();
+    for (auto &rs_mesh : p3m.fft_buffers->get_vector_mesh()) {
+      p3m.fft->backward_fft(rs_mesh);
+    }
+    p3m.fft_buffers->perform_vector_halo_spread();
     p3m.fft->check_complex_residuals = false;
 
     auto const force_prefac = prefactor / volume;
@@ -785,25 +796,25 @@ void CoulombP3M::sanity_checks_boxl() const {
   auto const &local_geo = *system.local_geo;
   for (auto i = 0u; i < 3u; i++) {
     /* check k-space cutoff */
-    if (p3m.params.cao_cut[i] >= box_geo.length_half()[i]) {
+    if (p3m_params.cao_cut[i] >= box_geo.length_half()[i]) {
       std::stringstream msg;
-      msg << "P3M_init: k-space cutoff " << p3m.params.cao_cut[i]
+      msg << "P3M_init: k-space cutoff " << p3m_params.cao_cut[i]
           << " is larger than half of box dimension " << box_geo.length()[i];
       throw std::runtime_error(msg.str());
     }
-    if (p3m.params.cao_cut[i] >= local_geo.length()[i]) {
+    if (p3m_params.cao_cut[i] >= local_geo.length()[i]) {
       std::stringstream msg;
-      msg << "P3M_init: k-space cutoff " << p3m.params.cao_cut[i]
+      msg << "P3M_init: k-space cutoff " << p3m_params.cao_cut[i]
           << " is larger than local box dimension " << local_geo.length()[i];
       throw std::runtime_error(msg.str());
     }
   }
 
-  if (p3m.params.epsilon != P3M_EPSILON_METALLIC) {
+  if (p3m_params.epsilon != P3M_EPSILON_METALLIC) {
     if ((box_geo.length()[0] != box_geo.length()[1]) or
         (box_geo.length()[1] != box_geo.length()[2]) or
-        (p3m.params.mesh[0] != p3m.params.mesh[1]) or
-        (p3m.params.mesh[1] != p3m.params.mesh[2])) {
+        (p3m_params.mesh[0] != p3m_params.mesh[1]) or
+        (p3m_params.mesh[1] != p3m_params.mesh[2])) {
       throw std::runtime_error(
           "CoulombP3M: non-metallic epsilon requires cubic box");
     }
@@ -841,7 +852,8 @@ void CoulombP3M::sanity_checks_node_grid() const {
   }
 }
 
-void CoulombP3M::scaleby_box_l() {
+template <typename FloatType, Arch Architecture>
+void CoulombP3MImpl<FloatType, Architecture>::scaleby_box_l() {
   auto const &box_geo = *get_system().box_geo;
   p3m.params.r_cut = p3m.params.r_cut_iL * box_geo.length()[0];
   p3m.params.alpha = p3m.params.alpha_L * box_geo.length_inv()[0];
diff --git a/src/core/electrostatics/p3m.hpp b/src/core/electrostatics/p3m.hpp
index f7d7fb1995..773e2da83b 100644
--- a/src/core/electrostatics/p3m.hpp
+++ b/src/core/electrostatics/p3m.hpp
@@ -50,38 +50,19 @@
 
 #include <cmath>
 #include <numbers>
-#include <stdexcept>
 
 /** @brief P3M solver. */
 struct CoulombP3M : public Coulomb::Actor<CoulombP3M> {
-  /** @brief Coulomb P3M parameters. */
-  p3m_data_struct &p3m;
-
-  int tune_timings;
-  bool tune_verbose;
-  bool check_complex_residuals;
-
-protected:
-  bool m_is_tuned;
+  P3MParameters const &p3m_params;
 
 public:
-  CoulombP3M(p3m_data_struct &p3m_data, double prefactor, int tune_timings,
-             bool tune_verbose, bool check_complex_residuals)
-      : p3m{p3m_data}, tune_timings{tune_timings}, tune_verbose{tune_verbose},
-        check_complex_residuals{check_complex_residuals} {
-
-    if (tune_timings <= 0) {
-      throw std::domain_error("Parameter 'timings' must be > 0");
-    }
-    m_is_tuned = not p3m.params.tuning;
-    p3m.params.tuning = false;
-    set_prefactor(prefactor);
-  }
+  CoulombP3M(P3MParameters const &p3m_params) : p3m_params{p3m_params} {}
 
   virtual ~CoulombP3M() = default;
 
-  [[nodiscard]] bool is_tuned() const { return m_is_tuned; }
+  [[nodiscard]] virtual bool is_tuned() const noexcept = 0;
   [[nodiscard]] virtual bool is_gpu() const noexcept = 0;
+  [[nodiscard]] virtual bool is_double_precision() const noexcept = 0;
 
   /** @brief Recalculate all derived parameters. */
   virtual void init() = 0;
@@ -109,6 +90,7 @@ struct CoulombP3M : public Coulomb::Actor<CoulombP3M> {
    */
   virtual void count_charged_particles() = 0;
   virtual void count_charged_particles_elc(int, double, double) = 0;
+  virtual void adapt_epsilon_elc() = 0;
 
   /**
    * @brief Tune P3M parameters to desired accuracy.
@@ -119,7 +101,7 @@ struct CoulombP3M : public Coulomb::Actor<CoulombP3M> {
    * @ref P3MParameters::r_cut_iL "r_cut_iL" and
    * @ref P3MParameters::alpha_L "alpha_L" are tuned to obtain the target
    * @ref P3MParameters::accuracy "accuracy" in optimal time.
-   * These parameters are stored in the @ref p3m object.
+   * These parameters are stored in the @ref p3m_params object.
    *
    * The function utilizes the analytic expression of the error estimate
    * for the P3M method in @cite hockney88a (eq. (8.23)) in
@@ -163,10 +145,10 @@ struct CoulombP3M : public Coulomb::Actor<CoulombP3M> {
   /** Calculate real-space contribution of p3m Coulomb pair forces. */
   Utils::Vector3d pair_force(double q1q2, Utils::Vector3d const &d,
                              double dist) const {
-    if ((q1q2 == 0.) || dist >= p3m.params.r_cut || dist <= 0.) {
+    if ((q1q2 == 0.) || dist >= p3m_params.r_cut || dist <= 0.) {
       return {};
     }
-    auto const alpha = p3m.params.alpha;
+    auto const alpha = p3m_params.alpha;
     auto const adist = alpha * dist;
     auto const exp_adist_sq = exp(-adist * adist);
     auto const dist_sq = dist * dist;
@@ -184,10 +166,10 @@ struct CoulombP3M : public Coulomb::Actor<CoulombP3M> {
   /** Calculate real-space contribution of Coulomb pair energy. */
   // Eq. (3.6) @cite deserno00b
   double pair_energy(double q1q2, double dist) const {
-    if ((q1q2 == 0.) || dist >= p3m.params.r_cut || dist <= 0.) {
+    if ((q1q2 == 0.) || dist >= p3m_params.r_cut || dist <= 0.) {
       return {};
     }
-    auto const adist = p3m.params.alpha * dist;
+    auto const adist = p3m_params.alpha * dist;
 #if USE_ERFC_APPROXIMATION
     auto const erfc_part_ri = Utils::AS_erfc_part(adist) / dist;
     return prefactor * q1q2 * erfc_part_ri * exp(-adist * adist);
@@ -216,7 +198,7 @@ struct CoulombP3M : public Coulomb::Actor<CoulombP3M> {
   void sanity_checks_periodicity() const;
   void sanity_checks_cell_structure() const;
 
-  virtual void scaleby_box_l();
+  virtual void scaleby_box_l() = 0;
 };
 
 #endif // P3M
diff --git a/src/core/electrostatics/p3m.impl.hpp b/src/core/electrostatics/p3m.impl.hpp
index 7ac688ab30..8f8fb6d92f 100644
--- a/src/core/electrostatics/p3m.impl.hpp
+++ b/src/core/electrostatics/p3m.impl.hpp
@@ -19,19 +19,6 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-/** @file
- *  P3M algorithm for long-range Coulomb interaction.
- *
- *  We use a P3M (Particle-Particle Particle-Mesh) method based on the
- *  Ewald summation. Details of the used method can be found in
- *  @cite hockney88a and @cite deserno98a @cite deserno98b.
- *
- *  Further reading: @cite ewald21a, @cite hockney88a, @cite deserno98a,
- *  @cite deserno98b, @cite deserno00e, @cite deserno00b, @cite cerda08d.
- *
- *  Implementation in p3m.cpp.
- */
-
 #pragma once
 
 #include "config/config.hpp"
@@ -49,11 +36,13 @@
 #include <utils/Vector.hpp>
 
 #include <memory>
+#include <stdexcept>
+#include <type_traits>
 #include <utility>
 
 template <typename FloatType>
-struct p3m_data_struct_coulomb : public p3m_data_struct_fft<FloatType> {
-  using p3m_data_struct_fft<FloatType>::p3m_data_struct_fft;
+struct p3m_data_struct_coulomb : public p3m_data_struct<FloatType> {
+  using p3m_data_struct<FloatType>::p3m_data_struct;
 
   /** number of charged particles (only on head node). */
   int sum_qpart = 0;
@@ -73,17 +62,34 @@ template <typename FloatType, Arch Architecture>
 struct CoulombP3MImpl : public CoulombP3M {
   ~CoulombP3MImpl() override = default;
 
+  /** @brief Coulomb P3M parameters. */
+  p3m_data_struct_coulomb<FloatType> &p3m;
+
+private:
   /** @brief Coulomb P3M meshes and FFT algorithm. */
   std::unique_ptr<p3m_data_struct_coulomb<FloatType>> p3m_impl;
-  // this member overshadows its own base class pointer in the parent class
-  p3m_data_struct_coulomb<FloatType> &p3m;
+  int tune_timings;
+  bool tune_verbose;
+  bool check_complex_residuals;
+  bool m_is_tuned;
 
-  template <typename... Args>
+public:
   CoulombP3MImpl(
       std::unique_ptr<p3m_data_struct_coulomb<FloatType>> &&p3m_handle,
-      Args &&...args)
-      : CoulombP3M(*p3m_handle, std::forward<Args>(args)...),
-        p3m_impl{std::move(p3m_handle)}, p3m{*p3m_impl} {}
+      double prefactor, int tune_timings, bool tune_verbose,
+      bool check_complex_residuals)
+      : CoulombP3M(p3m_handle->params), p3m{*p3m_handle},
+        p3m_impl{std::move(p3m_handle)}, tune_timings{tune_timings},
+        tune_verbose{tune_verbose},
+        check_complex_residuals{check_complex_residuals} {
+
+    if (tune_timings <= 0) {
+      throw std::domain_error("Parameter 'timings' must be > 0");
+    }
+    m_is_tuned = not p3m.params.tuning;
+    p3m.params.tuning = false;
+    set_prefactor(prefactor);
+  }
 
   void init() override {
     if constexpr (Architecture == Arch::CPU) {
@@ -103,10 +109,17 @@ struct CoulombP3MImpl : public CoulombP3M {
     p3m.sum_q2 = sum_q2;
     p3m.square_sum_q = square_sum_q;
   }
+  void adapt_epsilon_elc() override {
+    p3m.params.epsilon = P3M_EPSILON_METALLIC;
+  }
 
+  [[nodiscard]] bool is_tuned() const noexcept override { return m_is_tuned; }
   [[nodiscard]] bool is_gpu() const noexcept override {
     return Architecture == Arch::GPU;
   }
+  [[nodiscard]] bool is_double_precision() const noexcept override {
+    return std::is_same_v<FloatType, double>;
+  }
 
   void on_activation() override {
 #ifdef CUDA
@@ -160,6 +173,7 @@ struct CoulombP3MImpl : public CoulombP3M {
                            ParticleRange const &particles);
   void calc_influence_function_force() override;
   void calc_influence_function_energy() override;
+  void scaleby_box_l() override;
   void init_cpu_kernels();
 #ifdef CUDA
   void init_gpu_kernels();
@@ -170,13 +184,15 @@ struct CoulombP3MImpl : public CoulombP3M {
 };
 
 template <typename FloatType, Arch Architecture,
-          template <typename> class FFTBackendImpl, class... Args>
+          template <typename> class FFTBackendImpl,
+          template <typename> class P3MFFTMeshImpl, class... Args>
 std::shared_ptr<CoulombP3M> new_p3m_handle(P3MParameters &&p3m,
                                            Args &&...args) {
   auto obj = std::make_shared<CoulombP3MImpl<FloatType, Architecture>>(
       std::make_unique<p3m_data_struct_coulomb<FloatType>>(std::move(p3m)),
       std::forward<Args>(args)...);
-  obj->p3m.template make_fft_instance<FFTBackendImpl<FloatType>>(false);
+  obj->p3m.template make_mesh_instance<P3MFFTMeshImpl<FloatType>>();
+  obj->p3m.template make_fft_instance<FFTBackendImpl<FloatType>>();
   return obj;
 }
 
diff --git a/src/core/fft/fft.cpp b/src/core/fft/fft.cpp
index 9dad7b9aa5..89b05bc386 100644
--- a/src/core/fft/fft.cpp
+++ b/src/core/fft/fft.cpp
@@ -28,6 +28,8 @@
 #include "fft.hpp"
 #include "vector.hpp"
 
+#include "p3m/packing.hpp"
+
 #include <utils/Vector.hpp>
 #include <utils/index.hpp>
 #include <utils/math/permute_ifield.hpp>
@@ -394,7 +396,7 @@ void fft_data_struct<FloatType>::forw_grid_comm(
     FloatType const *in, FloatType *out) {
   for (std::size_t i = 0ul; i < plan.group.size(); i++) {
     plan.pack_function(in, send_buf.data(), &(plan.send_block[6ul * i]),
-                       &(plan.send_block[6ul * i + 3ul]), plan.old_mesh,
+                       &(plan.send_block[6ul * i + 3ul]), plan.old_mesh.data(),
                        plan.element);
 
     if (plan.group[i] != comm.rank()) {
@@ -405,7 +407,7 @@ void fft_data_struct<FloatType>::forw_grid_comm(
       std::swap(send_buf, recv_buf);
     }
     fft_unpack_block(recv_buf.data(), out, &(plan.recv_block[6ul * i]),
-                     &(plan.recv_block[6ul * i + 3ul]), plan.new_mesh,
+                     &(plan.recv_block[6ul * i + 3ul]), plan.new_mesh.data(),
                      plan.element);
   }
 }
@@ -429,8 +431,8 @@ void fft_data_struct<FloatType>::back_grid_comm(
 
   for (std::size_t i = 0ul; i < plan_f.group.size(); i++) {
     plan_b.pack_function(in, send_buf.data(), &(plan_f.recv_block[6ul * i]),
-                         &(plan_f.recv_block[6ul * i + 3ul]), plan_f.new_mesh,
-                         plan_f.element);
+                         &(plan_f.recv_block[6ul * i + 3ul]),
+                         plan_f.new_mesh.data(), plan_f.element);
 
     if (plan_f.group[i] != comm.rank()) { /* send first, receive second */
       fft_sendrecv(send_buf.data(), plan_f.recv_size[i], plan_f.group[i],
@@ -440,8 +442,8 @@ void fft_data_struct<FloatType>::back_grid_comm(
       std::swap(send_buf, recv_buf);
     }
     fft_unpack_block(recv_buf.data(), out, &(plan_f.send_block[6ul * i]),
-                     &(plan_f.send_block[6ul * i + 3ul]), plan_f.old_mesh,
-                     plan_f.element);
+                     &(plan_f.send_block[6ul * i + 3ul]),
+                     plan_f.old_mesh.data(), plan_f.element);
   }
 }
 
@@ -596,9 +598,9 @@ int fft_data_struct<FloatType>::initialize_fft(
 
     forw[i].new_size = calc_local_mesh(
         my_pos[i], n_grid[i], global_mesh_dim.data(), global_mesh_off.data(),
-        forw[i].new_mesh, forw[i].start);
-    permute_ifield(forw[i].new_mesh, 3, -(forw[i].n_permute));
-    permute_ifield(forw[i].start, 3, -(forw[i].n_permute));
+        forw[i].new_mesh.data(), forw[i].start.data());
+    permute_ifield(forw[i].new_mesh.data(), 3, -(forw[i].n_permute));
+    permute_ifield(forw[i].start.data(), 3, -(forw[i].n_permute));
     forw[i].n_ffts = forw[i].new_mesh[0] * forw[i].new_mesh[1];
 
     /* === send/recv block specifications === */
@@ -782,58 +784,6 @@ void fft_data_struct<FloatType>::backward_fft(
   /* REMARK: Result has to be in data. */
 }
 
-template <typename FloatType>
-void fft_pack_block(FloatType const *const in, FloatType *const out,
-                    int const start[3], int const size[3], int const dim[3],
-                    int element) {
-
-  auto const copy_size =
-      static_cast<std::size_t>(element * size[2]) * sizeof(FloatType);
-  /* offsets for indices in input grid */
-  auto const m_in_offset = element * dim[2];
-  auto const s_in_offset = element * (dim[2] * (dim[1] - size[1]));
-  /* offsets for indices in output grid */
-  auto const m_out_offset = element * size[2];
-  /* linear index of input grid, linear index of output grid */
-  int li_in = element * (start[2] + dim[2] * (start[1] + dim[1] * start[0]));
-  int li_out = 0;
-
-  for (int s = 0; s < size[0]; s++) {
-    for (int m = 0; m < size[1]; m++) {
-      memmove(&(out[li_out]), &(in[li_in]), copy_size);
-      li_in += m_in_offset;
-      li_out += m_out_offset;
-    }
-    li_in += s_in_offset;
-  }
-}
-
-template <typename FloatType>
-void fft_unpack_block(FloatType const *const in, FloatType *const out,
-                      int const start[3], int const size[3], int const dim[3],
-                      int element) {
-
-  auto const copy_size =
-      static_cast<std::size_t>(element * size[2]) * sizeof(FloatType);
-  /* offsets for indices in output grid */
-  auto const m_out_offset = element * dim[2];
-  auto const s_out_offset = element * (dim[2] * (dim[1] - size[1]));
-  /* offset for indices in input grid */
-  auto const m_in_offset = element * size[2];
-  /* linear index of in grid, linear index of out grid */
-  int li_in = 0;
-  int li_out = element * (start[2] + dim[2] * (start[1] + dim[1] * start[0]));
-
-  for (int s = 0; s < size[0]; s++) {
-    for (int m = 0; m < size[1]; m++) {
-      memmove(&(out[li_out]), &(in[li_in]), copy_size);
-      li_in += m_in_offset;
-      li_out += m_out_offset;
-    }
-    li_out += s_out_offset;
-  }
-}
-
 template <typename FloatType> void fft_plan<FloatType>::destroy_plan() {
   if (plan_handle) {
     fftw<FloatType>::destroy_plan(plan_handle);
@@ -877,17 +827,4 @@ template struct fft_back_plan<double>;
 template struct fft_data_struct<float>;
 template struct fft_data_struct<double>;
 
-template void fft_pack_block<float>(float const *in, float *out,
-                                    int const start[3], int const size[3],
-                                    int const dim[3], int element);
-template void fft_pack_block<double>(double const *in, double *out,
-                                     int const start[3], int const size[3],
-                                     int const dim[3], int element);
-template void fft_unpack_block<float>(float const *in, float *out,
-                                      int const start[3], int const size[3],
-                                      int const dim[3], int element);
-template void fft_unpack_block<double>(double const *in, double *out,
-                                       int const start[3], int const size[3],
-                                       int const dim[3], int element);
-
 } // namespace fft
diff --git a/src/core/fft/fft.hpp b/src/core/fft/fft.hpp
index 5c3afe58d1..a462a958c3 100644
--- a/src/core/fft/fft.hpp
+++ b/src/core/fft/fft.hpp
@@ -88,11 +88,11 @@ struct fft_forw_plan : public fft_plan<FloatType> {
   int n_ffts;
 
   /** size of local mesh before communication. */
-  int old_mesh[3];
+  std::array<int, 3u> old_mesh;
   /** size of local mesh after communication, also used for actual FFT. */
-  int new_mesh[3];
+  std::array<int, 3u> new_mesh;
   /** lower left point of local FFT mesh in global FFT mesh coordinates. */
-  int start[3];
+  std::array<int, 3u> start;
   /** size of new mesh (number of mesh points). */
   int new_size;
 
@@ -197,9 +197,9 @@ template <typename FloatType> struct fft_data_struct {
   void backward_fft(boost::mpi::communicator const &comm, FloatType *data,
                     bool check_complex);
 
-  auto get_mesh_size() const { return forw[3u].new_mesh; }
+  auto const &get_mesh_size() const { return forw[3u].new_mesh; }
 
-  auto get_mesh_start() const { return forw[3u].start; }
+  auto const &get_mesh_start() const { return forw[3u].start; }
 
 private:
   void forw_grid_comm(boost::mpi::communicator const &comm,
@@ -211,43 +211,6 @@ template <typename FloatType> struct fft_data_struct {
                       FloatType const *in, FloatType *out);
 };
 
-/** Pack a block (<tt>size[3]</tt> starting at <tt>start[3]</tt>) of an input
- *  3d-grid with dimension <tt>dim[3]</tt> into an output 3d-block with
- *  dimension <tt>size[3]</tt>.
- *
- *  The block with dimensions <tt>(size[0], size[1], size[2])</tt> is stored
- *  in 'row-major-order' or 'C-order', that means the first index is
- *  changing slowest when running through the linear array. The
- *  element (@c i0 (slow), @c i1 (mid), @c i2 (fast)) has the linear index
- *  <tt>li = i2 + size[2] * (i1 + (size[1] * i0))</tt>.
- *
- *  \param[in]  in      pointer to input 3d-grid.
- *  \param[out] out     pointer to output 3d-grid (block).
- *  \param[in]  start   start index of the block in the in-grid.
- *  \param[in]  size    size of the block (=dimension of the out-grid).
- *  \param[in]  dim     size of the in-grid.
- *  \param[in]  element size of a grid element (e.g. 1 for Real, 2 for Complex).
- */
-template <typename FloatType>
-void fft_pack_block(FloatType const *in, FloatType *out, int const start[3],
-                    int const size[3], int const dim[3], int element);
-
-/** Unpack a 3d-grid input block (<tt>size[3]</tt>) into an output 3d-grid
- *  with dimension <tt>dim[3]</tt> at start position <tt>start[3]</tt>.
- *
- *  see also \ref fft_pack_block.
- *
- *  \param[in]  in      pointer to input 3d-grid.
- *  \param[out] out     pointer to output 3d-grid (block).
- *  \param[in]  start   start index of the block in the in-grid.
- *  \param[in]  size    size of the block (=dimension of the out-grid).
- *  \param[in]  dim     size of the in-grid.
- *  \param[in]  element size of a grid element (e.g. 1 for Real, 2 for Complex).
- */
-template <typename FloatType>
-void fft_unpack_block(FloatType const *in, FloatType *out, int const start[3],
-                      int const size[3], int const dim[3], int element);
-
 int map_3don2d_grid(int const g3d[3], int g2d[3]);
 
 std::optional<std::vector<int>> find_comm_groups(Utils::Vector3i const &,
diff --git a/src/core/magnetostatics/dipoles.cpp b/src/core/magnetostatics/dipoles.cpp
index 5b9e3c98a5..d29a1ac714 100644
--- a/src/core/magnetostatics/dipoles.cpp
+++ b/src/core/magnetostatics/dipoles.cpp
@@ -91,7 +91,7 @@ double Solver::cutoff() const {
 #ifdef DP3M
   if (impl->solver) {
     if (auto dp3m = get_actor_by_type<DipolarP3M>(impl->solver)) {
-      return dp3m->dp3m.params.r_cut;
+      return dp3m->dp3m_params.r_cut;
     }
   }
 #endif
diff --git a/src/core/magnetostatics/dlc.cpp b/src/core/magnetostatics/dlc.cpp
index e327a11d0f..15d820d3c8 100644
--- a/src/core/magnetostatics/dlc.cpp
+++ b/src/core/magnetostatics/dlc.cpp
@@ -465,7 +465,7 @@ struct AdaptSolver {
 #ifdef DP3M
   void operator()(std::shared_ptr<DipolarP3M> const &solver) {
     m_actor->prefactor = solver->prefactor;
-    m_actor->epsilon = solver->dp3m.params.epsilon;
+    m_actor->epsilon = solver->dp3m_params.epsilon;
   }
 #endif
 
diff --git a/src/core/magnetostatics/dp3m.cpp b/src/core/magnetostatics/dp3m.cpp
index 68e409b316..8855a8e4d8 100644
--- a/src/core/magnetostatics/dp3m.cpp
+++ b/src/core/magnetostatics/dp3m.cpp
@@ -76,6 +76,10 @@
 #include <tuple>
 #include <vector>
 
+#ifdef FFTW3_H
+#error "The FFTW3 library shouldn't be visible in this translation unit"
+#endif
+
 template <typename FloatType, Arch Architecture>
 void DipolarP3MImpl<FloatType, Architecture>::count_magnetic_particles() {
   int local_n = 0;
@@ -136,8 +140,11 @@ void DipolarP3MImpl<FloatType, Architecture>::init_cpu_kernels() {
 
   assert(dp3m.fft);
   dp3m.local_mesh.calc_local_ca_mesh(dp3m.params, local_geo, verlet_skin, 0.);
-  dp3m.fft->init_halo();
-  dp3m.fft->init_fft();
+  dp3m.fft_buffers->init_halo();
+  dp3m.fft->init(dp3m.params);
+  dp3m.mesh.ks_pnum = dp3m.fft->get_ks_pnum();
+  dp3m.fft_buffers->init_meshes(dp3m.fft->get_ca_mesh_size());
+  dp3m.update_mesh_views();
   dp3m.calc_differential_operator();
 
   /* fix box length dependent constants */
@@ -253,8 +260,11 @@ double DipolarP3MImpl<FloatType, Architecture>::long_range_kernel(
 
   if (dp3m.sum_mu2 > 0.) {
     dipole_assign(particles);
-    dp3m.fft->perform_vector_halo_gather();
-    dp3m.fft->perform_vector_fwd_fft();
+    dp3m.fft_buffers->perform_vector_halo_gather();
+    for (auto &rs_mesh : dp3m.fft_buffers->get_vector_mesh()) {
+      dp3m.fft->forward_fft(rs_mesh);
+    }
+    dp3m.update_mesh_views();
   }
 
   /* === k-space energy calculation  === */
@@ -323,8 +333,9 @@ double DipolarP3MImpl<FloatType, Architecture>::long_range_kernel(
       auto &mesh_dip = dp3m.mesh.rs_fields;
       auto indices = Utils::Vector3i{};
       auto index = std::size_t(0u);
+      dp3m.ks_scalar.resize(dp3m.mesh.rs_scalar.size());
 
-      /* fill in mesh.ks_scalar array for torque calculation */
+      /* fill in ks_scalar array for torque calculation */
       auto it_energy = dp3m.g_energy.begin();
       index = 0u;
       for_each_3d(mesh_start, mesh_stop, indices, [&]() {
@@ -333,13 +344,13 @@ double DipolarP3MImpl<FloatType, Architecture>::long_range_kernel(
         auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
                         mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
                         mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
-        dp3m.mesh.ks_scalar[index] = *it_energy * re;
+        dp3m.ks_scalar[index] = *it_energy * re;
         ++index;
         // Im(mu)*k
         auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
                         mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
                         mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
-        dp3m.mesh.ks_scalar[index] = *it_energy * im;
+        dp3m.ks_scalar[index] = *it_energy * im;
         ++index;
         std::advance(it_energy, 1);
       });
@@ -349,13 +360,13 @@ double DipolarP3MImpl<FloatType, Architecture>::long_range_kernel(
         index = 0u;
         for_each_3d(mesh_start, mesh_stop, indices, [&]() {
           auto const d_op_val = FloatType(d_op[indices[d] + offset[d]]);
-          dp3m.mesh.rs_scalar[index] = d_op_val * dp3m.mesh.ks_scalar[index];
+          dp3m.mesh.rs_scalar[index] = d_op_val * dp3m.ks_scalar[index];
           ++index;
-          dp3m.mesh.rs_scalar[index] = d_op_val * dp3m.mesh.ks_scalar[index];
+          dp3m.mesh.rs_scalar[index] = d_op_val * dp3m.ks_scalar[index];
           ++index;
         });
-        dp3m.fft->perform_scalar_back_fft();
-        dp3m.fft->perform_scalar_halo_spread();
+        dp3m.fft->backward_fft(dp3m.fft_buffers->get_scalar_mesh());
+        dp3m.fft_buffers->perform_scalar_halo_spread();
         /* Assign force component from mesh to particle */
         auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3;
         Utils::integral_parameter<int, AssignTorques, 1, 7>(
@@ -369,7 +380,7 @@ double DipolarP3MImpl<FloatType, Architecture>::long_range_kernel(
       // grids dp3m.mesh.rs_fields !
       // Note: I'll do here 9 inverse FFTs. By symmetry, we can reduce this
       // number to 6 !
-      /* fill in mesh.ks_scalar array for force calculation */
+      /* fill in ks_scalar array for force calculation */
       auto it_force = dp3m.g_force.begin();
       index = 0u;
       for_each_3d(mesh_start, mesh_stop, indices, [&]() {
@@ -384,8 +395,8 @@ double DipolarP3MImpl<FloatType, Architecture>::long_range_kernel(
                         mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
                         mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
         ++index;
-        dp3m.mesh.ks_scalar[index - 2] = *it_force * im;
-        dp3m.mesh.ks_scalar[index - 1] = *it_force * (-re);
+        dp3m.ks_scalar[index - 2] = *it_force * im;
+        dp3m.ks_scalar[index - 1] = *it_force * (-re);
         std::advance(it_force, 1);
       });
 
@@ -395,19 +406,21 @@ double DipolarP3MImpl<FloatType, Architecture>::long_range_kernel(
         for_each_3d(mesh_start, mesh_stop, indices, [&]() {
           auto const d_op_val = FloatType(d_op[indices[d] + offset[d]]);
           auto const shift = indices + offset;
-          auto const f1 = d_op_val * dp3m.mesh.ks_scalar[index];
+          auto const f1 = d_op_val * dp3m.ks_scalar[index];
           mesh_dip[0u][index] = FloatType(d_op[shift[KX]]) * f1;
           mesh_dip[1u][index] = FloatType(d_op[shift[KY]]) * f1;
           mesh_dip[2u][index] = FloatType(d_op[shift[KZ]]) * f1;
           ++index;
-          auto const f2 = d_op_val * dp3m.mesh.ks_scalar[index];
+          auto const f2 = d_op_val * dp3m.ks_scalar[index];
           mesh_dip[0u][index] = FloatType(d_op[shift[KX]]) * f2;
           mesh_dip[1u][index] = FloatType(d_op[shift[KY]]) * f2;
           mesh_dip[2u][index] = FloatType(d_op[shift[KZ]]) * f2;
           ++index;
         });
-        dp3m.fft->perform_vector_back_fft();
-        dp3m.fft->perform_vector_halo_spread();
+        for (auto &rs_mesh : dp3m.fft_buffers->get_vector_mesh()) {
+          dp3m.fft->backward_fft(rs_mesh);
+        }
+        dp3m.fft_buffers->perform_vector_halo_spread();
         /* Assign force component from mesh to particle */
         auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3;
         Utils::integral_parameter<int, AssignForces, 1, 7>(
@@ -435,8 +448,9 @@ double DipolarP3MImpl<FloatType, Architecture>::long_range_kernel(
   return energy;
 }
 
-double DipolarP3M::calc_surface_term(bool force_flag, bool energy_flag,
-                                     ParticleRange const &particles) {
+template <typename FloatType, Arch Architecture>
+double DipolarP3MImpl<FloatType, Architecture>::calc_surface_term(
+    bool force_flag, bool energy_flag, ParticleRange const &particles) {
   auto const &box_geo = *get_system().box_geo;
   auto const pref = prefactor * 4. * std::numbers::pi / box_geo.volume() /
                     (2. * dp3m.params.epsilon + 1.);
@@ -811,15 +825,15 @@ void DipolarP3M::sanity_checks_boxl() const {
   auto const &local_geo = *system.local_geo;
   for (auto i = 0u; i < 3u; i++) {
     /* check k-space cutoff */
-    if (dp3m.params.cao_cut[i] >= box_geo.length_half()[i]) {
+    if (dp3m_params.cao_cut[i] >= box_geo.length_half()[i]) {
       std::stringstream msg;
-      msg << "dipolar P3M_init: k-space cutoff " << dp3m.params.cao_cut[i]
+      msg << "dipolar P3M_init: k-space cutoff " << dp3m_params.cao_cut[i]
           << " is larger than half of box dimension " << box_geo.length()[i];
       throw std::runtime_error(msg.str());
     }
-    if (dp3m.params.cao_cut[i] >= local_geo.length()[i]) {
+    if (dp3m_params.cao_cut[i] >= local_geo.length()[i]) {
       std::stringstream msg;
-      msg << "dipolar P3M_init: k-space cutoff " << dp3m.params.cao_cut[i]
+      msg << "dipolar P3M_init: k-space cutoff " << dp3m_params.cao_cut[i]
           << " is larger than local box dimension " << local_geo.length()[i];
       throw std::runtime_error(msg.str());
     }
@@ -862,7 +876,8 @@ void DipolarP3M::sanity_checks_node_grid() const {
   }
 }
 
-void DipolarP3M::scaleby_box_l() {
+template <typename FloatType, Arch Architecture>
+void DipolarP3MImpl<FloatType, Architecture>::scaleby_box_l() {
   auto const &box_geo = *get_system().box_geo;
   dp3m.params.r_cut = dp3m.params.r_cut_iL * box_geo.length()[0];
   dp3m.params.alpha = dp3m.params.alpha_L * box_geo.length_inv()[0];
@@ -871,6 +886,7 @@ void DipolarP3M::scaleby_box_l() {
   sanity_checks_boxl();
   calc_influence_function_force();
   calc_influence_function_energy();
+  dp3m.energy_correction = 0.;
 }
 
 template <typename FloatType, Arch Architecture>
diff --git a/src/core/magnetostatics/dp3m.hpp b/src/core/magnetostatics/dp3m.hpp
index 6909b44caf..a01624c489 100644
--- a/src/core/magnetostatics/dp3m.hpp
+++ b/src/core/magnetostatics/dp3m.hpp
@@ -49,7 +49,6 @@
 
 #include <cmath>
 #include <numbers>
-#include <stdexcept>
 
 #ifdef NPT
 /** Update the NpT virial */
@@ -58,36 +57,16 @@ void npt_add_virial_magnetic_contribution(double energy);
 
 /** @brief Dipolar P3M solver. */
 struct DipolarP3M : public Dipoles::Actor<DipolarP3M> {
-  /** @brief Dipolar P3M parameters. */
-  p3m_data_struct &dp3m;
-
-  int tune_timings;
-  bool tune_verbose;
-
-protected:
-  bool m_is_tuned;
+  P3MParameters const &dp3m_params;
 
 public:
-  DipolarP3M(p3m_data_struct &dp3m_data, double prefactor, int tune_timings,
-             bool tune_verbose)
-      : dp3m{dp3m_data}, tune_timings{tune_timings},
-        tune_verbose{tune_verbose} {
-
-    if (tune_timings <= 0) {
-      throw std::domain_error("Parameter 'timings' must be > 0");
-    }
-    if (dp3m.params.mesh != Utils::Vector3i::broadcast(dp3m.params.mesh[0])) {
-      throw std::domain_error("DipolarP3M requires a cubic mesh");
-    }
-    m_is_tuned = not dp3m.params.tuning;
-    dp3m.params.tuning = false;
-    set_prefactor(prefactor);
-  }
+  DipolarP3M(P3MParameters const &dp3m_params) : dp3m_params{dp3m_params} {}
 
   virtual ~DipolarP3M() = default;
 
-  [[nodiscard]] bool is_tuned() const { return m_is_tuned; }
+  [[nodiscard]] virtual bool is_tuned() const noexcept = 0;
   [[nodiscard]] virtual bool is_gpu() const noexcept = 0;
+  [[nodiscard]] virtual bool is_double_precision() const noexcept = 0;
 
   virtual void on_activation() = 0;
   /** @brief Recalculate all box-length-dependent parameters. */
@@ -126,7 +105,7 @@ struct DipolarP3M : public Dipoles::Actor<DipolarP3M> {
    * @ref P3MParameters::r_cut_iL "r_cut_iL" and
    * @ref P3MParameters::alpha_L "alpha_L" are tuned to obtain the target
    * @ref P3MParameters::accuracy "accuracy" in optimal time.
-   * These parameters are stored in the @ref dp3m object.
+   * These parameters are stored in the @ref dp3m_params object.
    *
    * The function utilizes the analytic expression of the error estimate
    * for the dipolar P3M method in the paper of @cite cerda08d in
@@ -163,14 +142,14 @@ struct DipolarP3M : public Dipoles::Actor<DipolarP3M> {
   inline ParticleForce pair_force(Particle const &p1, Particle const &p2,
                                   Utils::Vector3d const &d, double dist2,
                                   double dist) const {
-    if ((p1.dipm() == 0.) || (p2.dipm() == 0.) || dist >= dp3m.params.r_cut ||
+    if ((p1.dipm() == 0.) || (p2.dipm() == 0.) || dist >= dp3m_params.r_cut ||
         dist <= 0.)
       return {};
 
     auto const dip1 = p1.calc_dip();
     auto const dip2 = p2.calc_dip();
-    auto const alpsq = dp3m.params.alpha * dp3m.params.alpha;
-    auto const adist = dp3m.params.alpha * dist;
+    auto const alpsq = dp3m_params.alpha * dp3m_params.alpha;
+    auto const adist = dp3m_params.alpha * dist;
 #if USE_ERFC_APPROXIMATION
     auto const erfc_part_ri = Utils::AS_erfc_part(adist) / dist;
 #else
@@ -183,11 +162,11 @@ struct DipolarP3M : public Dipoles::Actor<DipolarP3M> {
     auto const mir = dip1 * d;
     auto const mjr = dip2 * d;
 
-    auto const coeff = 2. * dp3m.params.alpha * std::numbers::inv_sqrtpi;
+    auto const coeff = 2. * dp3m_params.alpha * std::numbers::inv_sqrtpi;
     auto const dist2i = 1. / dist2;
     auto const exp_adist2 = exp(-Utils::sqr(adist));
 
-    auto const B_r = (dp3m.params.accuracy > 5e-06)
+    auto const B_r = (dp3m_params.accuracy > 5e-06)
                          ? (erfc_part_ri + coeff) * exp_adist2 * dist2i
                          : (erfc(adist) / dist + coeff * exp_adist2) * dist2i;
 
@@ -221,15 +200,15 @@ struct DipolarP3M : public Dipoles::Actor<DipolarP3M> {
   inline double pair_energy(Particle const &p1, Particle const &p2,
                             Utils::Vector3d const &d, double dist2,
                             double dist) const {
-    if ((p1.dipm() == 0.) || (p2.dipm() == 0.) || dist >= dp3m.params.r_cut ||
+    if ((p1.dipm() == 0.) || (p2.dipm() == 0.) || dist >= dp3m_params.r_cut ||
         dist <= 0.)
       return {};
 
     auto const dip1 = p1.calc_dip();
     auto const dip2 = p2.calc_dip();
 
-    auto const alpsq = dp3m.params.alpha * dp3m.params.alpha;
-    auto const adist = dp3m.params.alpha * dist;
+    auto const alpsq = dp3m_params.alpha * dp3m_params.alpha;
+    auto const adist = dp3m_params.alpha * dist;
 
 #if USE_ERFC_APPROXIMATION
     auto const erfc_part_ri = Utils::AS_erfc_part(adist) / dist;
@@ -242,11 +221,11 @@ struct DipolarP3M : public Dipoles::Actor<DipolarP3M> {
     auto const mir = dip1 * d;
     auto const mjr = dip2 * d;
 
-    auto const coeff = 2. * dp3m.params.alpha * std::numbers::inv_sqrtpi;
+    auto const coeff = 2. * dp3m_params.alpha * std::numbers::inv_sqrtpi;
     auto const dist2i = 1. / dist2;
     auto const exp_adist2 = exp(-Utils::sqr(adist));
 
-    auto const B_r = (dp3m.params.accuracy > 5e-06)
+    auto const B_r = (dp3m_params.accuracy > 5e-06)
                          ? dist2i * (erfc_part_ri + coeff) * exp_adist2
                          : dist2i * (erfc(adist) / dist + coeff * exp_adist2);
     auto const C_r = (3. * B_r + 2. * alpsq * coeff * exp_adist2) * dist2i;
@@ -270,8 +249,8 @@ struct DipolarP3M : public Dipoles::Actor<DipolarP3M> {
   virtual void calc_influence_function_energy() = 0;
 
   /** Compute the dipolar surface terms */
-  double calc_surface_term(bool force_flag, bool energy_flag,
-                           ParticleRange const &particles);
+  virtual double calc_surface_term(bool force_flag, bool energy_flag,
+                                   ParticleRange const &particles) = 0;
 
   /** Checks for correctness of the k-space cutoff. */
   void sanity_checks_boxl() const;
@@ -279,7 +258,7 @@ struct DipolarP3M : public Dipoles::Actor<DipolarP3M> {
   void sanity_checks_periodicity() const;
   void sanity_checks_cell_structure() const;
 
-  virtual void scaleby_box_l();
+  virtual void scaleby_box_l() = 0;
 };
 
 #endif // DP3M
diff --git a/src/core/magnetostatics/dp3m.impl.hpp b/src/core/magnetostatics/dp3m.impl.hpp
index 07874460e3..ee50a2bd9d 100644
--- a/src/core/magnetostatics/dp3m.impl.hpp
+++ b/src/core/magnetostatics/dp3m.impl.hpp
@@ -19,16 +19,6 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-/** @file
- *  P3M algorithm for long-range magnetic dipole-dipole interaction.
- *
- *  We use here a P3M (Particle-Particle Particle-Mesh) method based
- *  on the dipolar Ewald summation. Details of the used method can be
- *  found in @cite hockney88a and @cite deserno98a @cite deserno98b.
- *
- *  Further reading: @cite cerda08d
- */
-
 #pragma once
 
 #include "config/config.hpp"
@@ -41,18 +31,19 @@
 #include "p3m/data_struct.hpp"
 #include "p3m/interpolation.hpp"
 
-#include "Particle.hpp"
 #include "ParticleRange.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/math/AS_erfc_part.hpp>
 
 #include <memory>
+#include <stdexcept>
+#include <type_traits>
 #include <utility>
+#include <vector>
 
 template <typename FloatType>
-struct p3m_data_struct_dipoles : public p3m_data_struct_fft<FloatType> {
-  using p3m_data_struct_fft<FloatType>::p3m_data_struct_fft;
+struct p3m_data_struct_dipoles : public p3m_data_struct<FloatType> {
+  using p3m_data_struct<FloatType>::p3m_data_struct;
 
   /** number of dipolar particles (only on head node). */
   int sum_dip_part = 0;
@@ -64,6 +55,8 @@ struct p3m_data_struct_dipoles : public p3m_data_struct_fft<FloatType> {
 
   /** cached k-space self-energy correction */
   double energy_correction = 0.;
+  /** k-space scalar mesh for k-space calculations. */
+  std::vector<FloatType> ks_scalar;
 
   p3m_interpolation_cache inter_weights;
 };
@@ -72,17 +65,34 @@ template <typename FloatType, Arch Architecture>
 struct DipolarP3MImpl : public DipolarP3M {
   ~DipolarP3MImpl() override = default;
 
+  /** @brief Dipolar P3M parameters. */
+  p3m_data_struct_dipoles<FloatType> &dp3m;
+
+private:
   /** @brief Coulomb P3M meshes and FFT algorithm. */
   std::unique_ptr<p3m_data_struct_dipoles<FloatType>> dp3m_impl;
-  // this member overshadows its own base class pointer in the parent class
-  p3m_data_struct_dipoles<FloatType> &dp3m;
+  int tune_timings;
+  bool tune_verbose;
+  bool m_is_tuned;
 
-  template <typename... Args>
+public:
   DipolarP3MImpl(
       std::unique_ptr<p3m_data_struct_dipoles<FloatType>> &&dp3m_handle,
-      Args &&...args)
-      : DipolarP3M(*dp3m_handle, std::forward<Args>(args)...),
-        dp3m_impl{std::move(dp3m_handle)}, dp3m{*dp3m_impl} {}
+      double prefactor, int tune_timings, bool tune_verbose)
+      : DipolarP3M(dp3m_handle->params), dp3m{*dp3m_handle},
+        dp3m_impl{std::move(dp3m_handle)}, tune_timings{tune_timings},
+        tune_verbose{tune_verbose} {
+
+    if (tune_timings <= 0) {
+      throw std::domain_error("Parameter 'timings' must be > 0");
+    }
+    if (dp3m.params.mesh != Utils::Vector3i::broadcast(dp3m.params.mesh[0])) {
+      throw std::domain_error("DipolarP3M requires a cubic mesh");
+    }
+    m_is_tuned = not dp3m.params.tuning;
+    dp3m.params.tuning = false;
+    set_prefactor(prefactor);
+  }
 
   void init() override {
     if constexpr (Architecture == Arch::CPU) {
@@ -92,9 +102,13 @@ struct DipolarP3MImpl : public DipolarP3M {
   void tune() override;
   void count_magnetic_particles() override;
 
+  [[nodiscard]] bool is_tuned() const noexcept override { return m_is_tuned; }
   [[nodiscard]] bool is_gpu() const noexcept override {
     return Architecture == Arch::GPU;
   }
+  [[nodiscard]] bool is_double_precision() const noexcept override {
+    return std::is_same_v<FloatType, double>;
+  }
 
   void on_activation() override {
     sanity_checks();
@@ -121,21 +135,22 @@ struct DipolarP3MImpl : public DipolarP3M {
   void calc_energy_correction() override;
   void calc_influence_function_force() override;
   void calc_influence_function_energy() override;
+  double calc_surface_term(bool force_flag, bool energy_flag,
+                           ParticleRange const &particles) override;
   void init_cpu_kernels();
-  void scaleby_box_l() override {
-    DipolarP3M::scaleby_box_l();
-    dp3m.energy_correction = 0.;
-  }
+  void scaleby_box_l() override;
 };
 
 template <typename FloatType, Arch Architecture,
-          template <typename> class FFTBackendImpl, class... Args>
+          template <typename> class FFTBackendImpl,
+          template <typename> class P3MFFTMeshImpl, class... Args>
 std::shared_ptr<DipolarP3M> new_dp3m_handle(P3MParameters &&p3m,
                                             Args &&...args) {
   auto obj = std::make_shared<DipolarP3MImpl<FloatType, Architecture>>(
       std::make_unique<p3m_data_struct_dipoles<FloatType>>(std::move(p3m)),
       std::forward<Args>(args)...);
-  obj->dp3m.template make_fft_instance<FFTBackendImpl<FloatType>>(true);
+  obj->dp3m.template make_mesh_instance<P3MFFTMeshImpl<FloatType>>();
+  obj->dp3m.template make_fft_instance<FFTBackendImpl<FloatType>>();
   return obj;
 }
 
diff --git a/src/core/p3m/CMakeLists.txt b/src/core/p3m/CMakeLists.txt
index 2a6dd407ba..a32b76fd5d 100644
--- a/src/core/p3m/CMakeLists.txt
+++ b/src/core/p3m/CMakeLists.txt
@@ -17,5 +17,6 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-target_sources(espresso_core PRIVATE common.cpp send_mesh.cpp
-                                     TuningAlgorithm.cpp FFTBackendLegacy.cpp)
+target_sources(
+  espresso_core PRIVATE common.cpp send_mesh.cpp TuningAlgorithm.cpp
+                        FFTBackendLegacy.cpp FFTBuffersLegacy.cpp)
diff --git a/src/core/p3m/FFTBackendLegacy.cpp b/src/core/p3m/FFTBackendLegacy.cpp
index c00f3440c8..b2cce8ea10 100644
--- a/src/core/p3m/FFTBackendLegacy.cpp
+++ b/src/core/p3m/FFTBackendLegacy.cpp
@@ -29,17 +29,11 @@
 
 #include "fft/fft.hpp"
 
-#include <utils/Vector.hpp>
-
-#include <array>
 #include <memory>
-#include <span>
-#include <utility>
 
 template <typename FloatType>
-FFTBackendLegacy<FloatType>::FFTBackendLegacy(
-    p3m_data_struct_fft<FloatType> &obj, bool dipolar)
-    : FFTBackend<FloatType>(obj), dipolar{dipolar},
+FFTBackendLegacy<FloatType>::FFTBackendLegacy(P3MLocalMesh const &local_mesh)
+    : FFTBackend<FloatType>(local_mesh),
       fft{std::make_unique<fft::fft_data_struct<FloatType>>(
           ::Communication::mpiCallbacksHandle()->share_mpi_env())} {}
 
@@ -47,90 +41,20 @@ template <typename FloatType>
 FFTBackendLegacy<FloatType>::~FFTBackendLegacy() = default;
 
 template <typename FloatType>
-void FFTBackendLegacy<FloatType>::update_mesh_data() {
-  auto const mesh_size_ptr = fft->get_mesh_size();
-  auto const mesh_start_ptr = fft->get_mesh_start();
-  for (auto i = 0u; i < 3u; ++i) {
-    mesh.size[i] = mesh_size_ptr[i];
-    mesh.start[i] = mesh_start_ptr[i];
-  }
-  mesh.stop = mesh.start + mesh.size;
-  mesh.ks_scalar = std::span(ks_mesh);
-  mesh.rs_scalar = std::span(rs_mesh);
-  for (auto i = 0u; i < 3u; ++i) {
-    mesh.rs_fields[i] = std::span(rs_mesh_fields[i]);
-  }
-}
-
-template <typename FloatType> void FFTBackendLegacy<FloatType>::init_halo() {
-  mesh_comm.resize(::comm_cart, local_mesh);
-}
-
-template <typename FloatType> void FFTBackendLegacy<FloatType>::init_fft() {
-  auto ca_mesh_size = fft->initialize_fft(
+void FFTBackendLegacy<FloatType>::init(P3MParameters const &params) {
+  ca_mesh_size = fft->initialize_fft(
       ::comm_cart, local_mesh.dim, local_mesh.margin, params.mesh,
-      params.mesh_off, mesh.ks_pnum, ::communicator.node_grid);
-  rs_mesh.resize(ca_mesh_size);
-  if (dipolar) {
-    ks_mesh.resize(ca_mesh_size);
-  }
-  for (auto &rs_mesh_field : rs_mesh_fields) {
-    rs_mesh_field.resize(ca_mesh_size);
-  }
-  update_mesh_data();
-}
-
-template <typename FloatType>
-void FFTBackendLegacy<FloatType>::perform_vector_back_fft() {
-  for (auto &rs_mesh_field : rs_mesh_fields) {
-    fft->backward_fft(::comm_cart, rs_mesh_field.data(),
-                      check_complex_residuals);
-  }
-}
-
-template <typename FloatType>
-void FFTBackendLegacy<FloatType>::perform_vector_halo_spread() {
-  std::array<FloatType *, 3u> meshes = {{rs_mesh_fields[0u].data(),
-                                         rs_mesh_fields[1u].data(),
-                                         rs_mesh_fields[2u].data()}};
-  mesh_comm.spread_grid(::comm_cart, meshes, local_mesh.dim);
-}
-
-template <typename FloatType>
-void FFTBackendLegacy<FloatType>::perform_scalar_halo_gather() {
-  mesh_comm.gather_grid(::comm_cart, rs_mesh.data(), local_mesh.dim);
-}
-
-template <typename FloatType>
-void FFTBackendLegacy<FloatType>::perform_scalar_fwd_fft() {
-  fft->forward_fft(::comm_cart, rs_mesh.data());
-  update_mesh_data();
-}
-
-template <typename FloatType>
-void FFTBackendLegacy<FloatType>::perform_vector_halo_gather() {
-  std::array<FloatType *, 3u> meshes = {{rs_mesh_fields[0u].data(),
-                                         rs_mesh_fields[1u].data(),
-                                         rs_mesh_fields[2u].data()}};
-  mesh_comm.gather_grid(::comm_cart, meshes, local_mesh.dim);
-}
-
-template <typename FloatType>
-void FFTBackendLegacy<FloatType>::perform_vector_fwd_fft() {
-  for (auto &rs_mesh_field : rs_mesh_fields) {
-    fft->forward_fft(::comm_cart, rs_mesh_field.data());
-  }
-  update_mesh_data();
+      params.mesh_off, ks_pnum, ::communicator.node_grid);
 }
 
 template <typename FloatType>
-void FFTBackendLegacy<FloatType>::perform_scalar_back_fft() {
-  fft->backward_fft(::comm_cart, rs_mesh.data(), check_complex_residuals);
+void FFTBackendLegacy<FloatType>::forward_fft(FloatType *rs_mesh) {
+  fft->forward_fft(::comm_cart, rs_mesh);
 }
 
 template <typename FloatType>
-void FFTBackendLegacy<FloatType>::perform_scalar_halo_spread() {
-  mesh_comm.spread_grid(::comm_cart, rs_mesh.data(), local_mesh.dim);
+void FFTBackendLegacy<FloatType>::backward_fft(FloatType *rs_mesh) {
+  fft->backward_fft(::comm_cart, rs_mesh, check_complex_residuals);
 }
 
 template class FFTBackendLegacy<float>;
diff --git a/src/core/p3m/FFTBackendLegacy.hpp b/src/core/p3m/FFTBackendLegacy.hpp
index bf2dd92f7b..fdffe0c718 100644
--- a/src/core/p3m/FFTBackendLegacy.hpp
+++ b/src/core/p3m/FFTBackendLegacy.hpp
@@ -27,9 +27,6 @@
 
 #include "common.hpp"
 #include "data_struct.hpp"
-#include "send_mesh.hpp"
-
-#include "fft/vector.hpp"
 
 #include <array>
 #include <memory>
@@ -49,34 +46,26 @@ class FFTBackendLegacy : public FFTBackend<FloatType> {
   static_assert(std::is_same_v<FloatType, float> or
                     std::is_same_v<FloatType, double>,
                 "FFTW only implements float and double");
-  bool dipolar;
   std::unique_ptr<fft::fft_data_struct<FloatType>> fft;
-  /** @brief k-space mesh (local) for k-space calculations. */
-  std::vector<FloatType> ks_mesh;
-  /** @brief real-space mesh (local) for CA/FFT. */
-  fft::vector<FloatType> rs_mesh;
-  /** @brief real-space mesh (local) for the electric or dipolar field. */
-  std::array<fft::vector<FloatType>, 3> rs_mesh_fields;
-  p3m_send_mesh<FloatType> mesh_comm;
-  using FFTBackend<FloatType>::params;
-  using FFTBackend<FloatType>::mesh;
   using FFTBackend<FloatType>::local_mesh;
   using FFTBackend<FloatType>::check_complex_residuals;
+  int ca_mesh_size = -1;
+  int ks_pnum = -1;
 
 public:
-  FFTBackendLegacy(p3m_data_struct_fft<FloatType> &obj, bool dipolar);
+  FFTBackendLegacy(P3MLocalMesh const &local_mesh);
   ~FFTBackendLegacy() override;
-  void init_fft() override;
-  void init_halo() override;
-  void perform_scalar_fwd_fft() override;
-  void perform_vector_fwd_fft() override;
-  void perform_scalar_back_fft() override;
-  void perform_vector_back_fft() override;
-  void perform_scalar_halo_gather() override;
-  void perform_vector_halo_gather() override;
-  void perform_scalar_halo_spread() override;
-  void perform_vector_halo_spread() override;
-  void update_mesh_data();
+  void init(P3MParameters const &params) override;
+  void forward_fft(FloatType *rs_mesh) override;
+  void backward_fft(FloatType *rs_mesh) override;
+  int get_ca_mesh_size() const noexcept override { return ca_mesh_size; }
+  int get_ks_pnum() const noexcept override { return ks_pnum; }
+  std::array<int, 3u> const &get_mesh_size() const override {
+    return fft->get_mesh_size();
+  }
+  std::array<int, 3u> const &get_mesh_start() const override {
+    return fft->get_mesh_start();
+  }
 
   /**
    * @brief Index helpers for reciprocal space.
diff --git a/src/core/p3m/FFTBuffersLegacy.cpp b/src/core/p3m/FFTBuffersLegacy.cpp
new file mode 100644
index 0000000000..608037f8e2
--- /dev/null
+++ b/src/core/p3m/FFTBuffersLegacy.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2010-2024 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
+ *   Max-Planck-Institute for Polymer Research, Theory Group
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config/config.hpp"
+
+#if defined(P3M) or defined(DP3M)
+
+#include "FFTBuffersLegacy.hpp"
+
+#include "communication.hpp"
+
+#include <array>
+#include <span>
+
+template <typename FloatType>
+FFTBuffersLegacy<FloatType>::~FFTBuffersLegacy() = default;
+
+template <typename FloatType>
+void FFTBuffersLegacy<FloatType>::update_mesh_views(
+    P3MFFTMesh<FloatType> &out) {
+  out.rs_scalar = std::span(rs_mesh);
+  for (auto i = 0u; i < 3u; ++i) {
+    out.rs_fields[i] = std::span(rs_mesh_fields[i]);
+  }
+}
+
+template <typename FloatType> void FFTBuffersLegacy<FloatType>::init_halo() {
+  mesh_comm.resize(::comm_cart, local_mesh);
+}
+
+template <typename FloatType>
+void FFTBuffersLegacy<FloatType>::init_meshes(int ca_mesh_size) {
+  rs_mesh.resize(ca_mesh_size);
+  for (auto &rs_mesh_field : rs_mesh_fields) {
+    rs_mesh_field.resize(ca_mesh_size);
+  }
+}
+
+template <typename FloatType>
+void FFTBuffersLegacy<FloatType>::perform_vector_halo_spread() {
+  std::array<FloatType *, 3u> meshes = {{rs_mesh_fields[0u].data(),
+                                         rs_mesh_fields[1u].data(),
+                                         rs_mesh_fields[2u].data()}};
+  mesh_comm.spread_grid(::comm_cart, meshes, local_mesh.dim);
+}
+
+template <typename FloatType>
+void FFTBuffersLegacy<FloatType>::perform_scalar_halo_gather() {
+  mesh_comm.gather_grid(::comm_cart, rs_mesh.data(), local_mesh.dim);
+}
+
+template <typename FloatType>
+void FFTBuffersLegacy<FloatType>::perform_vector_halo_gather() {
+  std::array<FloatType *, 3u> meshes = {{rs_mesh_fields[0u].data(),
+                                         rs_mesh_fields[1u].data(),
+                                         rs_mesh_fields[2u].data()}};
+  mesh_comm.gather_grid(::comm_cart, meshes, local_mesh.dim);
+}
+
+template <typename FloatType>
+void FFTBuffersLegacy<FloatType>::perform_scalar_halo_spread() {
+  mesh_comm.spread_grid(::comm_cart, rs_mesh.data(), local_mesh.dim);
+}
+
+template <typename FloatType>
+FloatType *FFTBuffersLegacy<FloatType>::get_scalar_mesh() {
+  return rs_mesh.data();
+}
+
+template <typename FloatType>
+std::array<FloatType *, 3u> FFTBuffersLegacy<FloatType>::get_vector_mesh() {
+  return {rs_mesh_fields[0u].data(), rs_mesh_fields[1u].data(),
+          rs_mesh_fields[2u].data()};
+}
+
+template class FFTBuffersLegacy<float>;
+template class FFTBuffersLegacy<double>;
+
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/p3m/FFTBuffersLegacy.hpp b/src/core/p3m/FFTBuffersLegacy.hpp
new file mode 100644
index 0000000000..130515cff1
--- /dev/null
+++ b/src/core/p3m/FFTBuffersLegacy.hpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2010-2024 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
+ *   Max-Planck-Institute for Polymer Research, Theory Group
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "config/config.hpp"
+
+#if defined(P3M) or defined(DP3M)
+
+#include "common.hpp"
+#include "data_struct.hpp"
+#include "send_mesh.hpp"
+
+#include "fft/vector.hpp"
+
+#include <array>
+#include <type_traits>
+
+/** @brief Buffers for @ref FFTBackendLegacy. */
+template <typename FloatType>
+class FFTBuffersLegacy : public FFTBuffers<FloatType> {
+  static_assert(std::is_same_v<FloatType, float> or
+                    std::is_same_v<FloatType, double>,
+                "FFTW only implements float and double");
+  using FFTBuffers<FloatType>::FFTBuffers;
+  using FFTBuffers<FloatType>::local_mesh;
+  p3m_send_mesh<FloatType> mesh_comm;
+  /** @brief real-space mesh (local) for CA/FFT. */
+  fft::vector<FloatType> rs_mesh;
+  /** @brief real-space mesh (local) for the electric or dipolar field. */
+  std::array<fft::vector<FloatType>, 3u> rs_mesh_fields;
+
+public:
+  ~FFTBuffersLegacy() override;
+  void init_halo() override;
+  void init_meshes(int ca_mesh_size) override;
+  void perform_scalar_halo_gather() override;
+  void perform_vector_halo_gather() override;
+  void perform_scalar_halo_spread() override;
+  void perform_vector_halo_spread() override;
+  void update_mesh_views(P3MFFTMesh<FloatType> &out) override;
+  FloatType *get_scalar_mesh() override;
+  std::array<FloatType *, 3u> get_vector_mesh() override;
+};
+
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/p3m/common.hpp b/src/core/p3m/common.hpp
index 4a4b88b22f..3cb9c1cb45 100644
--- a/src/core/p3m/common.hpp
+++ b/src/core/p3m/common.hpp
@@ -219,11 +219,9 @@ struct P3MLocalMesh {
 
 /** @brief Local mesh FFT buffers. */
 template <typename FloatType> struct P3MFFTMesh {
-  /** @brief k-space scalar mesh for k-space calculations. */
-  std::span<FloatType> ks_scalar;
   /** @brief real-space scalar mesh for charge assignment and FFT. */
   std::span<FloatType> rs_scalar;
-  /** @brief real-space 3D meshes for the electric or dipolar field. */
+  /** @brief real-space vector meshes for the electric or dipolar field. */
   std::array<std::span<FloatType>, 3> rs_fields;
 
   /** @brief Indices of the lower left corner of the local mesh grid. */
diff --git a/src/core/p3m/data_struct.hpp b/src/core/p3m/data_struct.hpp
index 7669a5b359..9bd81ad7c2 100644
--- a/src/core/p3m/data_struct.hpp
+++ b/src/core/p3m/data_struct.hpp
@@ -35,13 +35,16 @@
 #include <vector>
 
 template <typename FloatType> class FFTBackend;
+template <typename FloatType> class FFTBuffers;
 
 /**
  * @brief Base class for the electrostatics and magnetostatics P3M algorithms.
  * Contains a handle to the FFT backend, information about the local mesh,
  * the differential operator, and various buffers.
  */
-struct p3m_data_struct {
+template <typename FloatType> struct p3m_data_struct {
+  using value_type = FloatType;
+
   explicit p3m_data_struct(P3MParameters &&parameters)
       : params{std::move(parameters)} {}
 
@@ -49,6 +52,8 @@ struct p3m_data_struct {
   P3MParameters params;
   /** @brief Local mesh properties. */
   P3MLocalMesh local_mesh;
+  /** @brief Local mesh FFT buffers. */
+  P3MFFTMesh<FloatType> mesh;
 
   /**
    * @brief Spatial differential operator in k-space.
@@ -63,57 +68,81 @@ struct p3m_data_struct {
   void calc_differential_operator() {
     d_op = detail::calc_meshift(params.mesh, true);
   }
-};
-
-template <typename FloatType>
-struct p3m_data_struct_fft : public p3m_data_struct {
-  using p3m_data_struct::p3m_data_struct;
-  using value_type = FloatType;
-  /** @brief Local mesh FFT buffers. */
-  P3MFFTMesh<FloatType> mesh;
 
-  /** Force optimised influence function (k-space) */
+  /** @brief Force optimised influence function (k-space) */
   std::vector<FloatType> g_force;
-  /** Energy optimised influence function (k-space) */
+  /** @brief Energy optimised influence function (k-space) */
   std::vector<FloatType> g_energy;
-  /** FFT backend. */
+  /** @brief FFT algorithm. */
   std::unique_ptr<FFTBackend<FloatType>> fft;
+  /** @brief FFT buffers. */
+  std::unique_ptr<FFTBuffers<FloatType>> fft_buffers;
+
+  void init();
+
+  void update_mesh_views() {
+    auto const mesh_size_ptr = fft->get_mesh_size();
+    auto const mesh_start_ptr = fft->get_mesh_start();
+    for (auto i = 0u; i < 3u; ++i) {
+      mesh.size[i] = mesh_size_ptr[i];
+      mesh.start[i] = mesh_start_ptr[i];
+    }
+    mesh.stop = mesh.start + mesh.size;
+    fft_buffers->update_mesh_views(mesh);
+  }
 
   template <typename T, class... Args> void make_fft_instance(Args... args) {
     assert(fft == nullptr);
-    fft = std::make_unique<T>(*this, args...);
+    fft = std::make_unique<T>(std::as_const(local_mesh), args...);
+  }
+
+  template <typename T, class... Args> void make_mesh_instance(Args... args) {
+    assert(fft_buffers == nullptr);
+    fft_buffers = std::make_unique<T>(std::as_const(local_mesh), args...);
   }
 };
 
 /**
  * @brief API for the FFT backend of the P3M algorithm.
- * Any FFT backend must implement this interface.
- * The backend can read some members of @ref p3m_data_struct
- * but can only modify the FFT buffers in @ref P3MFFTMesh.
  */
 template <typename FloatType> class FFTBackend {
 protected:
-  P3MParameters const &params;
   P3MLocalMesh const &local_mesh;
-  P3MFFTMesh<FloatType> &mesh;
 
 public:
   bool check_complex_residuals = false;
-  explicit FFTBackend(p3m_data_struct_fft<FloatType> &obj)
-      : params{obj.params}, local_mesh{obj.local_mesh}, mesh{obj.mesh} {}
+  explicit FFTBackend(P3MLocalMesh const &local_mesh)
+      : local_mesh{local_mesh} {}
   virtual ~FFTBackend() = default;
-  /** @brief Initialize the FFT plans and buffers. */
-  virtual void init_fft() = 0;
-  /** @brief Initialize the halo buffers. */
-  virtual void init_halo() = 0;
+  virtual void init(P3MParameters const &params) = 0;
+  virtual int get_ca_mesh_size() const noexcept = 0;
+  virtual int get_ks_pnum() const noexcept = 0;
   /** @brief Carry out the forward FFT of the scalar mesh. */
-  virtual void perform_scalar_fwd_fft() = 0;
-  /** @brief Carry out the forward FFT of the vector meshes. */
-  virtual void perform_vector_fwd_fft() = 0;
+  virtual void forward_fft(FloatType *rs_mesh) = 0;
   /** @brief Carry out the backward FFT of the scalar mesh. */
-  virtual void perform_scalar_back_fft() = 0;
-  /** @brief Carry out the backward FFT of the vector meshes. */
-  virtual void perform_vector_back_fft() = 0;
+  virtual void backward_fft(FloatType *rs_mesh) = 0;
+  /** @brief Get indices of the k-space data layout. */
+  virtual std::tuple<int, int, int> get_permutations() const = 0;
+  virtual std::array<int, 3u> const &get_mesh_size() const = 0;
+  virtual std::array<int, 3u> const &get_mesh_start() const = 0;
+};
+
+/**
+ * @brief API for the FFT mesh buffers.
+ */
+template <typename FloatType> class FFTBuffers {
+protected:
+  P3MLocalMesh const &local_mesh;
+
+public:
+  bool check_complex_residuals = false;
+  explicit FFTBuffers(P3MLocalMesh const &local_mesh)
+      : local_mesh{local_mesh} {}
+  virtual ~FFTBuffers() = default;
+  /** @brief Initialize the meshes. */
+  virtual void init_meshes(int ca_mesh_size) = 0;
+  /** @brief Initialize the halo buffers. */
+  virtual void init_halo() = 0;
   /** @brief Update scalar mesh halo with data from neighbors (accumulation). */
   virtual void perform_scalar_halo_gather() = 0;
   /** @brief Update vector mesh halo with data from neighbors (accumulation). */
@@ -122,8 +151,21 @@ template <typename FloatType> class FFTBackend {
   virtual void perform_scalar_halo_spread() = 0;
   /** @brief Update vector mesh halo of all neighbors. */
   virtual void perform_vector_halo_spread() = 0;
-  /** @brief Get indices of the k-space data layout. */
-  virtual std::tuple<int, int, int> get_permutations() const = 0;
+  /**
+   * @brief Get pointer to scalar mesh begin.
+   * Should only be used by @ref FFTBackend.
+   */
+  virtual FloatType *get_scalar_mesh() = 0;
+  /**
+   * @brief Get pointer to vector mesh begin.
+   * Should only be used by @ref FFTBackend.
+   */
+  virtual std::array<FloatType *, 3u> get_vector_mesh() = 0;
+  /**
+   * @brief Update the scalar and vector mesh views in @ref P3MFFTMesh
+   * to point to the new underlying data structures.
+   */
+  virtual void update_mesh_views(P3MFFTMesh<FloatType> &out) = 0;
 };
 
 #endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/p3m/packing.hpp b/src/core/p3m/packing.hpp
new file mode 100644
index 0000000000..19f131a58c
--- /dev/null
+++ b/src/core/p3m/packing.hpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) 2010-2024 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
+ *   Max-Planck-Institute for Polymer Research, Theory Group
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstring>
+
+/**
+ * @brief Pack a 3D-block of size @c size starting at @c start of an input
+ * 3D-grid @c in with dimension @c dim into an output 3D-block @c out.
+ *
+ * The block is stored in 'row-major-order' or 'C-order', that means
+ * the first index is changing slowest when running through the linear array.
+ * The element (@c i0 (slow), @c i1 (mid), @c i2 (fast)) has the linear
+ * index <tt>li = i2 + size[2] * (i1 + (size[1] * i0))</tt>.
+ *
+ * @param[in]  in      pointer to input 3D-grid.
+ * @param[out] out     pointer to output 3D-block.
+ * @param[in]  start   start index of the block in the input 3D-grid.
+ * @param[in]  size    size of the output 3D-block.
+ * @param[in]  dim     size of the input 3D-grid.
+ * @param[in]  element size of a grid element (e.g. 1 for Real, 2 for Complex).
+ */
+template <typename FloatType>
+void fft_pack_block(FloatType const *const in, FloatType *const out,
+                    int const *start, int const *size, int const *dim,
+                    int element) {
+
+  auto const copy_size =
+      static_cast<std::size_t>(element * size[2]) * sizeof(FloatType);
+  /* offsets for indices in input grid */
+  auto const m_in_offset = element * dim[2];
+  auto const s_in_offset = element * (dim[2] * (dim[1] - size[1]));
+  /* offsets for indices in output grid */
+  auto const m_out_offset = element * size[2];
+  /* linear index of input grid, linear index of output grid */
+  int li_in = element * (start[2] + dim[2] * (start[1] + dim[1] * start[0]));
+  int li_out = 0;
+
+  for (int s = 0; s < size[0]; s++) {
+    for (int m = 0; m < size[1]; m++) {
+      memmove(&(out[li_out]), &(in[li_in]), copy_size);
+      li_in += m_in_offset;
+      li_out += m_out_offset;
+    }
+    li_in += s_in_offset;
+  }
+}
+
+/**
+ * @brief Unpack a 3D-block @c in of size @c size into an output
+ * 3D-grid @c out of size @c dim starting at position @c start.
+ *
+ * See also @ref fft_pack_block.
+ *
+ * @param[in]  in      pointer to input 3D-block.
+ * @param[out] out     pointer to output 3D-grid.
+ * @param[in]  start   start index of the block in the input 3D-grid.
+ * @param[in]  size    size of the input 3D-block.
+ * @param[in]  dim     size of the output 3D-grid.
+ * @param[in]  element size of a grid element (e.g. 1 for Real, 2 for Complex).
+ */
+template <typename FloatType>
+void fft_unpack_block(FloatType const *const in, FloatType *const out,
+                      int const *start, int const *size, int const *dim,
+                      int element) {
+
+  auto const copy_size =
+      static_cast<std::size_t>(element * size[2]) * sizeof(FloatType);
+  /* offsets for indices in output grid */
+  auto const m_out_offset = element * dim[2];
+  auto const s_out_offset = element * (dim[2] * (dim[1] - size[1]));
+  /* offset for indices in input grid */
+  auto const m_in_offset = element * size[2];
+  /* linear index of in grid, linear index of out grid */
+  int li_in = 0;
+  int li_out = element * (start[2] + dim[2] * (start[1] + dim[1] * start[0]));
+
+  for (int s = 0; s < size[0]; s++) {
+    for (int m = 0; m < size[1]; m++) {
+      memmove(&(out[li_out]), &(in[li_in]), copy_size);
+      li_in += m_in_offset;
+      li_out += m_out_offset;
+    }
+    li_out += s_out_offset;
+  }
+}
diff --git a/src/core/p3m/send_mesh.cpp b/src/core/p3m/send_mesh.cpp
index 4707936196..ee711eae3e 100644
--- a/src/core/p3m/send_mesh.cpp
+++ b/src/core/p3m/send_mesh.cpp
@@ -25,6 +25,7 @@
 
 #include "fft/fft.hpp"
 #include "p3m/common.hpp"
+#include "p3m/packing.hpp"
 #include "p3m/send_mesh.hpp"
 
 #include <utils/Vector.hpp>
@@ -174,8 +175,8 @@ void p3m_send_mesh<FloatType>::gather_grid(boost::mpi::communicator const &comm,
     /* pack send block */
     if (s_size[s_dir] > 0) {
       for (std::size_t i = 0; i < meshes.size(); i++) {
-        fft::fft_pack_block(meshes[i], send_grid.data() + i * s_size[s_dir],
-                            s_ld[s_dir], s_dim[s_dir], dim.data(), 1);
+        fft_pack_block(meshes[i], send_grid.data() + i * s_size[s_dir],
+                       s_ld[s_dir], s_dim[s_dir], dim.data(), 1);
       }
     }
 
@@ -214,8 +215,8 @@ void p3m_send_mesh<FloatType>::spread_grid(boost::mpi::communicator const &comm,
     /* pack send block */
     if (r_size[r_dir] > 0) {
       for (std::size_t i = 0; i < meshes.size(); i++) {
-        fft::fft_pack_block(meshes[i], send_grid.data() + i * r_size[r_dir],
-                            r_ld[r_dir], r_dim[r_dir], dim.data(), 1);
+        fft_pack_block(meshes[i], send_grid.data() + i * r_size[r_dir],
+                       r_ld[r_dir], r_dim[r_dir], dim.data(), 1);
       }
     }
     /* communication */
@@ -231,8 +232,8 @@ void p3m_send_mesh<FloatType>::spread_grid(boost::mpi::communicator const &comm,
     /* unpack recv block */
     if (s_size[s_dir] > 0) {
       for (std::size_t i = 0; i < meshes.size(); i++) {
-        fft::fft_unpack_block(recv_grid.data() + i * s_size[s_dir], meshes[i],
-                              s_ld[s_dir], s_dim[s_dir], dim.data(), 1);
+        fft_unpack_block(recv_grid.data() + i * s_size[s_dir], meshes[i],
+                         s_ld[s_dir], s_dim[s_dir], dim.data(), 1);
       }
     }
   }
diff --git a/src/core/unit_tests/EspressoSystemStandAlone_test.cpp b/src/core/unit_tests/EspressoSystemStandAlone_test.cpp
index c2f4a9d125..58a9bba30e 100644
--- a/src/core/unit_tests/EspressoSystemStandAlone_test.cpp
+++ b/src/core/unit_tests/EspressoSystemStandAlone_test.cpp
@@ -53,6 +53,7 @@ namespace utf = boost::unit_test;
 #include "observables/ParticleVelocities.hpp"
 #include "observables/PidObservable.hpp"
 #include "p3m/FFTBackendLegacy.hpp"
+#include "p3m/FFTBuffersLegacy.hpp"
 #include "particle_node.hpp"
 #include "system/System.hpp"
 
@@ -234,8 +235,9 @@ BOOST_FIXTURE_TEST_CASE(espresso_system_stand_alone, ParticleFactory) {
                              5,
                              0.615,
                              1e-3};
-    auto solver = new_p3m_handle<double, Arch::CPU, FFTBackendLegacy>(
-        std::move(p3m), prefactor, 1, false, true);
+    auto solver =
+        new_p3m_handle<double, Arch::CPU, FFTBackendLegacy, FFTBuffersLegacy>(
+            std::move(p3m), prefactor, 1, false, true);
     add_actor(comm, espresso::system, system.coulomb.impl->solver, solver,
               [&system]() { system.on_coulomb_change(); });
     BOOST_CHECK(not solver->is_gpu());
@@ -302,8 +304,9 @@ BOOST_FIXTURE_TEST_CASE(espresso_system_stand_alone, ParticleFactory) {
                              5,
                              0.615,
                              1e-3};
-    auto solver = new_dp3m_handle<double, Arch::CPU, FFTBackendLegacy>(
-        std::move(p3m), prefactor, 1, false);
+    auto solver =
+        new_dp3m_handle<double, Arch::CPU, FFTBackendLegacy, FFTBuffersLegacy>(
+            std::move(p3m), prefactor, 1, false);
     add_actor(comm, espresso::system, system.dipoles.impl->solver, solver,
               [&system]() { system.on_dipoles_change(); });
     BOOST_CHECK(not solver->is_gpu());
diff --git a/src/core/unit_tests/fft_test.cpp b/src/core/unit_tests/fft_test.cpp
index bf8100f887..9d5a3bf98e 100644
--- a/src/core/unit_tests/fft_test.cpp
+++ b/src/core/unit_tests/fft_test.cpp
@@ -20,15 +20,14 @@
 #define BOOST_TEST_MODULE "FFT utility functions"
 #define BOOST_TEST_DYN_LINK
 
-#include "config/config.hpp"
-
-#if defined(P3M) or defined(DP3M)
-
 #include <boost/test/unit_test.hpp>
 
+#include "config/config.hpp"
+
 #include "fft/fft.hpp"
 #include "fft/vector.hpp"
 #include "p3m/for_each_3d.hpp"
+#include "p3m/packing.hpp"
 
 #include <utils/Vector.hpp>
 
@@ -41,6 +40,8 @@
 #include <stdexcept>
 #include <vector>
 
+#if defined(P3M) or defined(DP3M)
+
 BOOST_AUTO_TEST_CASE(fft_find_comm_groups_mismatch) {
   using fft::find_comm_groups;
   int my_pos[3] = {0};
@@ -170,6 +171,8 @@ BOOST_AUTO_TEST_CASE(fft_plan_without_mpi) {
   delete plan;
 }
 
+#endif // defined(P3M) or defined(DP3M)
+
 BOOST_AUTO_TEST_CASE(for_each_3d_test) {
   auto const m_start = Utils::Vector3i{{0, -1, 3}};
   auto const m_stop = Utils::Vector3i{{2, 2, 5}};
@@ -207,6 +210,90 @@ BOOST_AUTO_TEST_CASE(for_each_3d_test) {
   }
 }
 
-#else  // defined(P3M) or defined(DP3M)
-int main(int argc, char **argv) {}
-#endif // defined(P3M) or defined(DP3M)
+template <typename T, int dim1, int dim2, int dim3, typename F>
+void run_on_grid(T (&grid)[dim1][dim2][dim3], F fun) {
+  for (auto i = 0; i < dim1; ++i) {
+    for (auto j = 0; j < dim2; ++j) {
+      for (auto k = 0; k < dim3; ++k) {
+        fun(grid, i, j, k);
+      }
+    }
+  }
+}
+
+BOOST_AUTO_TEST_CASE(packing_unpacking) {
+  // packing into block
+  {
+    int const dim[3] = {4, 6, 8};
+    int mesh[4][6][8] = {};
+    run_on_grid(mesh, [](auto &mesh, int i, int j, int k) {
+      mesh[i][j][k] = (i + 1) * (j + 1) * (k + 1);
+    });
+    int const start[3] = {1, 2, 3};
+    int const size[3] = {2, 3, 4};
+    int block[2][3][4] = {};
+    run_on_grid(block, [](auto &gd, int i, int j, int k) { gd[i][j][k] = -1; });
+    fft_pack_block(&(mesh[0][0][0]), &(block[0][0][0]), start, size, dim, 1);
+    run_on_grid(block, [&start](auto &block, int i, int j, int k) {
+      auto const ref =
+          (i + 1 + start[0u]) * (j + 1 + start[1u]) * (k + 1 + start[2u]);
+      BOOST_REQUIRE_EQUAL(block[i][j][k], ref);
+    });
+  }
+
+  // packing into self
+  {
+    int const dim[3] = {1, 1, 8};
+    int mesh[1][1][8] = {};
+    run_on_grid(mesh, [](auto &gd, int i, int j, int k) { gd[i][j][k] = k; });
+    int const start[3] = {0, 0, 0};
+    int const size[3] = {1, 1, 4};
+    fft_pack_block(&(mesh[0][0][0]), &(mesh[0][0][3]), start, size, dim, 1);
+    run_on_grid(mesh, [&size](auto &mesh, int i, int j, int k) {
+      auto ref = k;
+      if (k >= 3 and k < 3 + size[2u]) {
+        ref = k - 3;
+      }
+      BOOST_REQUIRE_EQUAL(mesh[i][j][k], ref);
+    });
+  }
+
+  // unpacking into grid
+  {
+    int const dim[3] = {4, 6, 8};
+    int mesh[4][6][8] = {};
+    run_on_grid(mesh, [](auto &mesh, int i, int j, int k) {
+      mesh[i][j][k] = (i + 1) * (j + 1) * (k + 1);
+    });
+    int const start[3] = {1, 2, 3};
+    int const size[3] = {2, 3, 4};
+    int block[2][3][4] = {};
+    run_on_grid(block, [](auto &block, int i, int j, int k) {
+      block[i][j][k] = -(i + 1) * (j + 1) * (k + 1);
+    });
+    fft_unpack_block(&(block[0][0][0]), &(mesh[0][0][0]), start, size, dim, 1);
+    run_on_grid(mesh, [&start, &size](auto &mesh, int i, int j, int k) {
+      auto ref = (i + 1) * (j + 1) * (k + 1);
+      if ((i >= start[0] and i < (start[0] + size[0])) and
+          (j >= start[1] and j < (start[1] + size[1])) and
+          (k >= start[2] and k < (start[2] + size[2]))) {
+        ref = -(i + 1 - start[0]) * (j + 1 - start[1]) * (k + 1 - start[2]);
+      }
+      BOOST_REQUIRE_EQUAL(mesh[i][j][k], ref);
+    });
+  }
+
+  // unpacking into self
+  {
+    int const dim[3] = {1, 1, 8};
+    int mesh[1][1][8] = {};
+    run_on_grid(mesh, [](auto &gd, int i, int j, int k) { gd[i][j][k] = k; });
+    int const start[3] = {0, 0, 0};
+    int const size[3] = {1, 1, 4};
+    fft_unpack_block(&(mesh[0][0][3]), &(mesh[0][0][0]), start, size, dim, 1);
+    run_on_grid(mesh, [&size](auto &mesh, int i, int j, int k) {
+      auto const ref = (k < size[2u]) ? k + 3 : k;
+      BOOST_REQUIRE_EQUAL(mesh[i][j][k], ref);
+    });
+  }
+}
diff --git a/src/script_interface/electrostatics/CoulombP3M.hpp b/src/script_interface/electrostatics/CoulombP3M.hpp
index 78d1e4d8e2..5e16440de9 100644
--- a/src/script_interface/electrostatics/CoulombP3M.hpp
+++ b/src/script_interface/electrostatics/CoulombP3M.hpp
@@ -28,18 +28,28 @@
 #include "core/electrostatics/p3m.hpp"
 #include "core/electrostatics/p3m.impl.hpp"
 #include "core/p3m/FFTBackendLegacy.hpp"
+#include "core/p3m/FFTBuffersLegacy.hpp"
 
 #include "script_interface/get_value.hpp"
 
 #include <memory>
+#include <stdexcept>
 #include <string>
+#include <utility>
+
+#ifdef FFTW3_H
+#error "The FFTW3 library shouldn't be visible in this translation unit"
+#endif
 
 namespace ScriptInterface {
 namespace Coulomb {
 
 template <Arch Architecture>
 class CoulombP3M : public Actor<CoulombP3M<Architecture>, ::CoulombP3M> {
+  int m_tune_timings;
   bool m_tune;
+  bool m_tune_verbose;
+  bool m_check_complex_residuals;
   bool m_single_precision;
 
 public:
@@ -56,44 +66,48 @@ class CoulombP3M : public Actor<CoulombP3M<Architecture>, ::CoulombP3M> {
   CoulombP3M() {
     add_parameters({
         {"single_precision", AutoParameter::read_only,
-         [this]() { return m_single_precision; }},
+         [this]() { return not actor()->is_double_precision(); }},
         {"alpha_L", AutoParameter::read_only,
-         [this]() { return actor()->p3m.params.alpha_L; }},
+         [this]() { return actor()->p3m_params.alpha_L; }},
         {"r_cut_iL", AutoParameter::read_only,
-         [this]() { return actor()->p3m.params.r_cut_iL; }},
+         [this]() { return actor()->p3m_params.r_cut_iL; }},
         {"mesh", AutoParameter::read_only,
-         [this]() { return actor()->p3m.params.mesh; }},
+         [this]() { return actor()->p3m_params.mesh; }},
         {"mesh_off", AutoParameter::read_only,
-         [this]() { return actor()->p3m.params.mesh_off; }},
+         [this]() { return actor()->p3m_params.mesh_off; }},
         {"cao", AutoParameter::read_only,
-         [this]() { return actor()->p3m.params.cao; }},
+         [this]() { return actor()->p3m_params.cao; }},
         {"accuracy", AutoParameter::read_only,
-         [this]() { return actor()->p3m.params.accuracy; }},
+         [this]() { return actor()->p3m_params.accuracy; }},
         {"epsilon", AutoParameter::read_only,
-         [this]() { return actor()->p3m.params.epsilon; }},
+         [this]() { return actor()->p3m_params.epsilon; }},
         {"a", AutoParameter::read_only,
-         [this]() { return actor()->p3m.params.a; }},
+         [this]() { return actor()->p3m_params.a; }},
         {"alpha", AutoParameter::read_only,
-         [this]() { return actor()->p3m.params.alpha; }},
+         [this]() { return actor()->p3m_params.alpha; }},
         {"r_cut", AutoParameter::read_only,
-         [this]() { return actor()->p3m.params.r_cut; }},
+         [this]() { return actor()->p3m_params.r_cut; }},
         {"is_tuned", AutoParameter::read_only,
          [this]() { return actor()->is_tuned(); }},
         {"verbose", AutoParameter::read_only,
-         [this]() { return actor()->tune_verbose; }},
+         [this]() { return m_tune_verbose; }},
         {"timings", AutoParameter::read_only,
-         [this]() { return actor()->tune_timings; }},
+         [this]() { return m_tune_timings; }},
         {"tune", AutoParameter::read_only, [this]() { return m_tune; }},
         {"check_complex_residuals", AutoParameter::read_only,
-         [this]() { return actor()->check_complex_residuals; }},
+         [this]() { return m_check_complex_residuals; }},
     });
   }
 
   void do_construct(VariantMap const &params) override {
     m_tune = get_value<bool>(params, "tune");
-    m_single_precision = get_value<bool>(params, "single_precision");
+    m_tune_timings = get_value<int>(params, "timings");
+    m_tune_verbose = get_value<bool>(params, "verbose");
+    m_check_complex_residuals =
+        get_value<bool>(params, "check_complex_residuals");
+    auto const single_precision = get_value<bool>(params, "single_precision");
     context()->parallel_try_catch([&]() {
-      if (Architecture == Arch::GPU and not m_single_precision) {
+      if (Architecture == Arch::GPU and not single_precision) {
         throw std::invalid_argument(
             "P3M GPU only implemented in single-precision mode");
       }
@@ -105,24 +119,25 @@ class CoulombP3M : public Actor<CoulombP3M<Architecture>, ::CoulombP3M> {
                                get_value<int>(params, "cao"),
                                get_value<double>(params, "alpha"),
                                get_value<double>(params, "accuracy")};
-      make_handle(m_single_precision, std::move(p3m),
-                  get_value<double>(params, "prefactor"),
-                  get_value<int>(params, "timings"),
-                  get_value<bool>(params, "verbose"),
-                  get_value<bool>(params, "check_complex_residuals"));
+      make_handle(single_precision, std::move(p3m),
+                  get_value<double>(params, "prefactor"), m_tune_timings,
+                  m_tune_verbose, m_check_complex_residuals);
     });
     set_charge_neutrality_tolerance(params);
   }
 
 private:
+  template <typename FloatType, class... Args>
+  void make_handle_impl(Args &&...args) {
+    m_actor = new_p3m_handle<FloatType, Architecture, FFTBackendLegacy,
+                             FFTBuffersLegacy>(std::forward<Args>(args)...);
+  }
   template <class... Args>
   void make_handle(bool single_precision, Args &&...args) {
     if (single_precision) {
-      m_actor = new_p3m_handle<float, Architecture, FFTBackendLegacy>(
-          std::forward<Args>(args)...);
+      make_handle_impl<float, Args...>(std::forward<Args>(args)...);
     } else {
-      m_actor = new_p3m_handle<double, Architecture, FFTBackendLegacy>(
-          std::forward<Args>(args)...);
+      make_handle_impl<double, Args...>(std::forward<Args>(args)...);
     }
   }
 };
diff --git a/src/script_interface/magnetostatics/DipolarP3M.hpp b/src/script_interface/magnetostatics/DipolarP3M.hpp
index 20dff5968e..67a2c2205d 100644
--- a/src/script_interface/magnetostatics/DipolarP3M.hpp
+++ b/src/script_interface/magnetostatics/DipolarP3M.hpp
@@ -28,18 +28,27 @@
 #include "core/magnetostatics/dp3m.hpp"
 #include "core/magnetostatics/dp3m.impl.hpp"
 #include "core/p3m/FFTBackendLegacy.hpp"
+#include "core/p3m/FFTBuffersLegacy.hpp"
 
 #include "script_interface/get_value.hpp"
 
 #include <memory>
+#include <stdexcept>
+#include <string>
+#include <utility>
+
+#ifdef FFTW3_H
+#error "The FFTW3 library shouldn't be visible in this translation unit"
+#endif
 
 namespace ScriptInterface {
 namespace Dipoles {
 
 template <Arch Architecture>
 class DipolarP3M : public Actor<DipolarP3M<Architecture>, ::DipolarP3M> {
+  int m_tune_timings;
   bool m_tune;
-  bool m_single_precision;
+  bool m_tune_verbose;
 
 public:
   using Base = Actor<DipolarP3M<Architecture>, ::DipolarP3M>;
@@ -54,40 +63,42 @@ class DipolarP3M : public Actor<DipolarP3M<Architecture>, ::DipolarP3M> {
   DipolarP3M() {
     add_parameters({
         {"single_precision", AutoParameter::read_only,
-         [this]() { return m_single_precision; }},
+         [this]() { return not actor()->is_double_precision(); }},
         {"alpha_L", AutoParameter::read_only,
-         [this]() { return actor()->dp3m.params.alpha_L; }},
+         [this]() { return actor()->dp3m_params.alpha_L; }},
         {"r_cut_iL", AutoParameter::read_only,
-         [this]() { return actor()->dp3m.params.r_cut_iL; }},
+         [this]() { return actor()->dp3m_params.r_cut_iL; }},
         {"mesh", AutoParameter::read_only,
-         [this]() { return actor()->dp3m.params.mesh; }},
+         [this]() { return actor()->dp3m_params.mesh; }},
         {"mesh_off", AutoParameter::read_only,
-         [this]() { return actor()->dp3m.params.mesh_off; }},
+         [this]() { return actor()->dp3m_params.mesh_off; }},
         {"cao", AutoParameter::read_only,
-         [this]() { return actor()->dp3m.params.cao; }},
+         [this]() { return actor()->dp3m_params.cao; }},
         {"accuracy", AutoParameter::read_only,
-         [this]() { return actor()->dp3m.params.accuracy; }},
+         [this]() { return actor()->dp3m_params.accuracy; }},
         {"epsilon", AutoParameter::read_only,
-         [this]() { return actor()->dp3m.params.epsilon; }},
+         [this]() { return actor()->dp3m_params.epsilon; }},
         {"a", AutoParameter::read_only,
-         [this]() { return actor()->dp3m.params.a; }},
+         [this]() { return actor()->dp3m_params.a; }},
         {"alpha", AutoParameter::read_only,
-         [this]() { return actor()->dp3m.params.alpha; }},
+         [this]() { return actor()->dp3m_params.alpha; }},
         {"r_cut", AutoParameter::read_only,
-         [this]() { return actor()->dp3m.params.r_cut; }},
+         [this]() { return actor()->dp3m_params.r_cut; }},
         {"is_tuned", AutoParameter::read_only,
          [this]() { return actor()->is_tuned(); }},
         {"verbose", AutoParameter::read_only,
-         [this]() { return actor()->tune_verbose; }},
+         [this]() { return m_tune_verbose; }},
         {"timings", AutoParameter::read_only,
-         [this]() { return actor()->tune_timings; }},
+         [this]() { return m_tune_timings; }},
         {"tune", AutoParameter::read_only, [this]() { return m_tune; }},
     });
   }
 
   void do_construct(VariantMap const &params) override {
     m_tune = get_value<bool>(params, "tune");
-    m_single_precision = get_value<bool>(params, "single_precision");
+    m_tune_timings = get_value<int>(params, "timings");
+    m_tune_verbose = get_value<bool>(params, "verbose");
+    auto const single_precision = get_value<bool>(params, "single_precision");
     static_assert(Architecture == Arch::CPU, "GPU not implemented");
     context()->parallel_try_catch([&]() {
       auto p3m = P3MParameters{!get_value_or<bool>(params, "is_tuned", !m_tune),
@@ -98,22 +109,24 @@ class DipolarP3M : public Actor<DipolarP3M<Architecture>, ::DipolarP3M> {
                                get_value<int>(params, "cao"),
                                get_value<double>(params, "alpha"),
                                get_value<double>(params, "accuracy")};
-      make_handle(m_single_precision, std::move(p3m),
-                  get_value<double>(params, "prefactor"),
-                  get_value<int>(params, "timings"),
-                  get_value<bool>(params, "verbose"));
+      make_handle(single_precision, std::move(p3m),
+                  get_value<double>(params, "prefactor"), m_tune_timings,
+                  m_tune_verbose);
     });
   }
 
 private:
+  template <typename FloatType, class... Args>
+  void make_handle_impl(Args &&...args) {
+    m_actor = new_dp3m_handle<FloatType, Architecture, FFTBackendLegacy,
+                              FFTBuffersLegacy>(std::forward<Args>(args)...);
+  }
   template <class... Args>
   void make_handle(bool single_precision, Args &&...args) {
     if (single_precision) {
-      m_actor = new_dp3m_handle<float, Architecture, FFTBackendLegacy>(
-          std::forward<Args>(args)...);
+      make_handle_impl<float, Args...>(std::forward<Args>(args)...);
     } else {
-      m_actor = new_dp3m_handle<double, Architecture, FFTBackendLegacy>(
-          std::forward<Args>(args)...);
+      make_handle_impl<double, Args...>(std::forward<Args>(args)...);
     }
   }
 };