diff --git a/maintainer/benchmarks/lb.py b/maintainer/benchmarks/lb.py
index 04453c5ea7..db3ad9726c 100644
--- a/maintainer/benchmarks/lb.py
+++ b/maintainer/benchmarks/lb.py
@@ -50,6 +50,11 @@
 parser.add_argument("--output", metavar="FILEPATH", action="store",
                     type=str, required=False, default="benchmarks.csv",
                     help="Output file (default: benchmarks.csv)")
+parser.add_argument("--blocks_per_mpi_rank", action="store", nargs=3,
+                    type=int, default=[1, 1, 1], required=False,
+                    help="blocks per mpi rank")
+parser.add_argument("--weak_scaling", action="store_true", required=False,
+                    help="The measurement of weak scaling")
 
 args = parser.parse_args()
 
@@ -101,13 +106,15 @@
     lb_grid = 3 * [lb_grid]
     box_l = 3 * [box_l]
 
-print(f"box length: {box_l}")
-print(f"LB shape: {lb_grid}")
-print(f"LB agrid: {agrid:.3f}")
-
 # System
 #############################################################
 system.box_l = box_l
+if args.weak_scaling:
+    system.box_l = box_l * system.cell_system.node_grid
+print(f"box length: {system.box_l}")
+print(f"LB shape: {lb_grid}")
+print(f"LB agrid: {agrid:.3f}")
+
 
 # Integration parameters
 #############################################################
@@ -145,7 +152,7 @@
 if args.multi_gpu:
     system.cuda_init_handle.call_method("set_device_id_per_rank")
 lbf = lb_class(agrid=agrid, tau=system.time_step, kinematic_viscosity=1.,
-               density=1., single_precision=args.single_precision)
+               density=1., single_precision=args.single_precision, blocks_per_mpi_rank=args.blocks_per_mpi_rank)
 system.lb = lbf
 if n_part:
     system.thermostat.set_lb(LB_fluid=lbf, gamma=1., seed=42)
diff --git a/src/core/integrate.cpp b/src/core/integrate.cpp
index 9ed1d628a4..badbc8f142 100644
--- a/src/core/integrate.cpp
+++ b/src/core/integrate.cpp
@@ -633,12 +633,18 @@ int System::System::integrate(int n_steps, int reuse_forces) {
           ek.propagate();
         }
       } else if (lb_active) {
+#ifdef CALIPER
+        CALI_MARK_BEGIN("LB.PROPAGATE");
+#endif
         auto const md_steps_per_lb_step = calc_md_steps_per_tau(lb.get_tau());
         propagation.lb_skipped_md_steps += 1;
         if (propagation.lb_skipped_md_steps >= md_steps_per_lb_step) {
           propagation.lb_skipped_md_steps = 0;
           lb.propagate();
         }
+#ifdef CALIPER
+        CALI_MARK_END("LB.PROPAGATE");
+#endif
       } else if (ek_active) {
         auto const md_steps_per_ek_step = calc_md_steps_per_tau(ek.get_tau());
         propagation.ek_skipped_md_steps += 1;
diff --git a/src/core/lb/LBWalberla.cpp b/src/core/lb/LBWalberla.cpp
index 9944d05408..37f3d78e64 100644
--- a/src/core/lb/LBWalberla.cpp
+++ b/src/core/lb/LBWalberla.cpp
@@ -40,6 +40,10 @@
 #include <optional>
 #include <variant>
 
+#ifdef CALIPER
+#include <caliper/cali.h>
+#endif
+
 namespace LB {
 
 bool LBWalberla::is_gpu() const { return lb_fluid->is_gpu(); }
@@ -50,7 +54,15 @@ Utils::VectorXd<9> LBWalberla::get_pressure_tensor() const {
   return lb_fluid->get_pressure_tensor();
 }
 
-void LBWalberla::propagate() { lb_fluid->integrate(); }
+void LBWalberla::propagate() {
+#ifdef CALIPER
+  CALI_MARK_BEGIN("LBWalberla.PROPAGATE");
+#endif
+  lb_fluid->integrate();
+#ifdef CALIPER
+  CALI_MARK_END("LBWalberla.PROPAGATE");
+#endif
+}
 
 void LBWalberla::ghost_communication() { lb_fluid->ghost_communication(); }
 
diff --git a/src/core/lb/Solver.cpp b/src/core/lb/Solver.cpp
index 758f36c4d7..9a75558057 100644
--- a/src/core/lb/Solver.cpp
+++ b/src/core/lb/Solver.cpp
@@ -47,6 +47,10 @@
 #include <variant>
 #include <vector>
 
+#ifdef CALIPER
+#include <caliper/cali.h>
+#endif
+
 namespace LB {
 
 Solver::Solver() { impl = std::make_unique<Implementation>(); }
@@ -69,8 +73,14 @@ void Solver::reset() {
 }
 
 void Solver::propagate() {
+#ifdef CALIPER
+  CALI_MARK_BEGIN("SOLVER.PROPAGATE");
+#endif
   check_solver(impl);
   std::visit([](auto &ptr) { ptr->propagate(); }, *impl->solver);
+#ifdef CALIPER
+  CALI_MARK_END("SOLVER.PROPAGATE");
+#endif
 }
 
 void Solver::ghost_communication() {
diff --git a/src/core/unit_tests/ek_interface_test.cpp b/src/core/unit_tests/ek_interface_test.cpp
index 0abe8917bd..a80b2fa2fa 100644
--- a/src/core/unit_tests/ek_interface_test.cpp
+++ b/src/core/unit_tests/ek_interface_test.cpp
@@ -83,7 +83,8 @@ static auto make_ek_actor() {
   auto constexpr n_ghost_layers = 1u;
   auto constexpr single_precision = true;
   ek_lattice = std::make_shared<LatticeWalberla>(
-      params.grid_dimensions, ::communicator.node_grid, n_ghost_layers);
+      params.grid_dimensions, ::communicator.node_grid,
+      ::communicator.node_grid, n_ghost_layers);
   ek_container = std::make_shared<EK::EKWalberla::ek_container_type>(
       params.tau, walberla::new_ek_poisson_none(ek_lattice, single_precision));
   ek_reactions = std::make_shared<EK::EKWalberla::ek_reactions_type>();
diff --git a/src/core/unit_tests/lb_particle_coupling_test.cpp b/src/core/unit_tests/lb_particle_coupling_test.cpp
index 97e0f4c2e8..4b6c875360 100644
--- a/src/core/unit_tests/lb_particle_coupling_test.cpp
+++ b/src/core/unit_tests/lb_particle_coupling_test.cpp
@@ -102,7 +102,8 @@ static auto make_lb_actor() {
   auto constexpr single_precision = false;
   lb_params = std::make_shared<LB::LBWalberlaParams>(params.agrid, params.tau);
   lb_lattice = std::make_shared<LatticeWalberla>(
-      params.grid_dimensions, ::communicator.node_grid, n_ghost_layers);
+      params.grid_dimensions, ::communicator.node_grid,
+      ::communicator.node_grid, n_ghost_layers);
   lb_fluid = new_lb_walberla_cpu(lb_lattice, params.viscosity, params.density,
                                  single_precision);
   lb_fluid->set_collision_model(params.kT, params.seed);
@@ -535,7 +536,8 @@ bool test_lb_domain_mismatch_local() {
   auto const params = std::make_shared<LB::LBWalberlaParams>(0.5, 0.01);
   ::communicator.node_grid = node_grid_reversed;
   auto const lattice = std::make_shared<LatticeWalberla>(
-      Utils::Vector3i{12, 12, 12}, node_grid_original, n_ghost_layers);
+      Utils::Vector3i{12, 12, 12}, node_grid_original, node_grid_original,
+      n_ghost_layers);
   auto const ptr = new_lb_walberla_cpu(lattice, 1.0, 1.0, false);
   ptr->set_collision_model(0.0, 0);
   ::communicator.node_grid = node_grid_original;
diff --git a/src/python/espressomd/detail/walberla.py b/src/python/espressomd/detail/walberla.py
index 6ec64dc94a..5a6c9a97b9 100644
--- a/src/python/espressomd/detail/walberla.py
+++ b/src/python/espressomd/detail/walberla.py
@@ -47,13 +47,13 @@ def __init__(self, *args, **kwargs):
             super().__init__(**kwargs)
 
     def valid_keys(self):
-        return {"agrid", "n_ghost_layers"}
+        return {"agrid", "n_ghost_layers", "blocks_per_mpi_rank"}
 
     def required_keys(self):
         return self.valid_keys()
 
     def default_params(self):
-        return {}
+        return {"blocks_per_mpi_rank": [1, 1, 1]}
 
     def get_node_indices_inside_shape(self, shape):
         if not isinstance(shape, espressomd.shapes.Shape):
diff --git a/src/python/espressomd/lb.py b/src/python/espressomd/lb.py
index e4b870a307..8f3cc05631 100644
--- a/src/python/espressomd/lb.py
+++ b/src/python/espressomd/lb.py
@@ -58,14 +58,14 @@ def validate_params(self, params):
 
     def valid_keys(self):
         return {"agrid", "tau", "density", "ext_force_density",
-                "kinematic_viscosity", "lattice", "kT", "seed"}
+                "kinematic_viscosity", "lattice", "kT", "seed", "blocks_per_mpi_rank"}
 
     def required_keys(self):
         return {"lattice", "density", "kinematic_viscosity", "tau"}
 
     def default_params(self):
         return {"lattice": None, "seed": 0, "kT": 0.,
-                "ext_force_density": [0.0, 0.0, 0.0]}
+                "ext_force_density": [0.0, 0.0, 0.0], "blocks_per_mpi_rank": [1, 1, 1]}
 
     def mach_limit(self):
         """
@@ -141,6 +141,8 @@ class LBFluidWalberla(HydrodynamicInteraction,
         Required for a thermalized fluid. Must be positive.
     single_precision : :obj:`bool`, optional
         Use single-precision floating-point arithmetic.
+    blocks_per_mpi_rank : (3,) array_like of :obj:`int`, optional
+        Distribute more than one block to each CPU.
 
     Methods
     -------
@@ -240,7 +242,7 @@ def validate_params(self, params):
             if "agrid" not in params:
                 raise ValueError("missing argument 'lattice' or 'agrid'")
             params["lattice"] = LatticeWalberla(
-                agrid=params.pop("agrid"), n_ghost_layers=1)
+                agrid=params.pop("agrid"), n_ghost_layers=1, blocks_per_mpi_rank=params.pop("blocks_per_mpi_rank"))
         elif "agrid" in params:
             raise ValueError("cannot provide both 'lattice' and 'agrid'")
 
diff --git a/src/script_interface/walberla/LBFluid.cpp b/src/script_interface/walberla/LBFluid.cpp
index 5b3bf4cabc..0ad6ab0ed4 100644
--- a/src/script_interface/walberla/LBFluid.cpp
+++ b/src/script_interface/walberla/LBFluid.cpp
@@ -139,6 +139,12 @@ void LBFluidGPU::make_instance(VariantMap const &params) {
   auto const visc = get_value<double>(params, "kinematic_viscosity");
   auto const dens = get_value<double>(params, "density");
   auto const precision = get_value<bool>(params, "single_precision");
+  auto const blocks_per_mpi_rank = get_value<Utils::Vector3i>(
+      m_lattice->get_parameter("blocks_per_mpi_rank"));
+  if (blocks_per_mpi_rank != Utils::Vector3i{{1, 1, 1}}) {
+    throw std::runtime_error(
+        "Using more than one block per MPI rank is not supported for GPU LB");
+  }
   auto const lb_lattice = m_lattice->lattice();
   auto const lb_visc = m_conv_visc * visc;
   auto const lb_dens = m_conv_dens * dens;
diff --git a/src/script_interface/walberla/LatticeWalberla.hpp b/src/script_interface/walberla/LatticeWalberla.hpp
index 999513f0a7..7208abdede 100644
--- a/src/script_interface/walberla/LatticeWalberla.hpp
+++ b/src/script_interface/walberla/LatticeWalberla.hpp
@@ -43,6 +43,7 @@ class LatticeWalberla : public AutoParameters<LatticeWalberla> {
   std::shared_ptr<::LatticeWalberla> m_lattice;
   double m_agrid;
   Utils::Vector3d m_box_l;
+  Utils::Vector3i m_blocks_per_mpi_rank;
 
 public:
   LatticeWalberla() {
@@ -53,6 +54,8 @@ class LatticeWalberla : public AutoParameters<LatticeWalberla> {
         {"shape", AutoParameter::read_only,
          [this]() { return m_lattice->get_grid_dimensions(); }},
         {"_box_l", AutoParameter::read_only, [this]() { return m_box_l; }},
+        {"blocks_per_mpi_rank", AutoParameter::read_only,
+         [this]() { return m_blocks_per_mpi_rank; }},
     });
   }
 
@@ -60,8 +63,11 @@ class LatticeWalberla : public AutoParameters<LatticeWalberla> {
     auto const &box_geo = *::System::get_system().box_geo;
     m_agrid = get_value<double>(args, "agrid");
     m_box_l = get_value_or<Utils::Vector3d>(args, "_box_l", box_geo.length());
+    m_blocks_per_mpi_rank =
+        get_value<Utils::Vector3i>(args, "blocks_per_mpi_rank");
     auto const n_ghost_layers = get_value<int>(args, "n_ghost_layers");
-
+    auto const block_grid = Utils::hadamard_product(::communicator.node_grid,
+                                                    m_blocks_per_mpi_rank);
     context()->parallel_try_catch([&]() {
       if (m_agrid <= 0.) {
         throw std::domain_error("Parameter 'agrid' must be > 0");
@@ -72,7 +78,7 @@ class LatticeWalberla : public AutoParameters<LatticeWalberla> {
       auto const grid_dim =
           ::LatticeWalberla::calc_grid_dimensions(m_box_l, m_agrid);
       m_lattice = std::make_shared<::LatticeWalberla>(
-          grid_dim, ::communicator.node_grid,
+          grid_dim, ::communicator.node_grid, block_grid,
           static_cast<unsigned int>(n_ghost_layers));
     });
   }
diff --git a/src/walberla_bridge/include/walberla_bridge/LatticeWalberla.hpp b/src/walberla_bridge/include/walberla_bridge/LatticeWalberla.hpp
index 03c5ff6291..b49693e848 100644
--- a/src/walberla_bridge/include/walberla_bridge/LatticeWalberla.hpp
+++ b/src/walberla_bridge/include/walberla_bridge/LatticeWalberla.hpp
@@ -52,6 +52,7 @@ class LatticeWalberla {
 public:
   LatticeWalberla(Utils::Vector3i const &grid_dimensions,
                   Utils::Vector3i const &node_grid,
+                  Utils::Vector3i const &block_grid,
                   unsigned int n_ghost_layers);
 
   // Grid, domain, halo
diff --git a/src/walberla_bridge/src/BoundaryPackInfo.hpp b/src/walberla_bridge/src/BoundaryPackInfo.hpp
index 83a26fa91d..48e3d4258c 100644
--- a/src/walberla_bridge/src/BoundaryPackInfo.hpp
+++ b/src/walberla_bridge/src/BoundaryPackInfo.hpp
@@ -96,7 +96,7 @@ class BoundaryPackInfo : public PackInfo<GhostLayerField_T> {
     WALBERLA_ASSERT_EQUAL(bSize, buf_size);
 #endif
 
-    auto const offset = std::get<0>(m_lattice->get_local_grid_range());
+    auto const offset = to_vector3i(receiver->getAABB().min());
     typename Boundary_T::value_type value;
     for (auto it = begin(flag_field); it != flag_field->end(); ++it) {
       if (isFlagSet(it, boundary_flag)) {
@@ -133,7 +133,7 @@ class BoundaryPackInfo : public PackInfo<GhostLayerField_T> {
            << buf_size;
 #endif
 
-    auto const offset = std::get<0>(m_lattice->get_local_grid_range());
+    auto const offset = to_vector3i(sender->getAABB().min());
     for (auto it = begin(flag_field); it != flag_field->end(); ++it) {
       if (isFlagSet(it, boundary_flag)) {
         auto const node = offset + Utils::Vector3i{{it.x(), it.y(), it.z()}};
diff --git a/src/walberla_bridge/src/LatticeWalberla.cpp b/src/walberla_bridge/src/LatticeWalberla.cpp
index 2dc2943a40..981c7a004a 100644
--- a/src/walberla_bridge/src/LatticeWalberla.cpp
+++ b/src/walberla_bridge/src/LatticeWalberla.cpp
@@ -40,6 +40,7 @@
 
 LatticeWalberla::LatticeWalberla(Utils::Vector3i const &grid_dimensions,
                                  Utils::Vector3i const &node_grid,
+                                 Utils::Vector3i const &block_grid,
                                  unsigned int n_ghost_layers)
     : m_grid_dimensions{grid_dimensions}, m_n_ghost_layers{n_ghost_layers} {
   using walberla::real_t;
@@ -50,21 +51,28 @@ LatticeWalberla::LatticeWalberla(Utils::Vector3i const &grid_dimensions,
       throw std::runtime_error(
           "Lattice grid dimensions and MPI node grid are not compatible.");
     }
+    if (m_grid_dimensions[i] % block_grid[i] != 0) {
+      throw std::runtime_error(
+          "Lattice grid dimensions and block grid are not compatible.");
+    }
   }
 
   auto constexpr lattice_constant = real_t{1};
-  auto const cells_block = Utils::hadamard_division(grid_dimensions, node_grid);
+  auto const cells_per_block =
+      Utils::hadamard_division(grid_dimensions, block_grid);
 
   m_blocks = walberla::blockforest::createUniformBlockGrid(
       // number of blocks in each direction
-      uint_c(node_grid[0]), uint_c(node_grid[1]), uint_c(node_grid[2]),
+      uint_c(block_grid[0]), uint_c(block_grid[1]), uint_c(block_grid[2]),
       // number of cells per block in each direction
-      uint_c(cells_block[0]), uint_c(cells_block[1]), uint_c(cells_block[2]),
-      lattice_constant,
+      uint_c(cells_per_block[0]), uint_c(cells_per_block[1]),
+      uint_c(cells_per_block[2]), lattice_constant,
       // number of cpus per direction
       uint_c(node_grid[0]), uint_c(node_grid[1]), uint_c(node_grid[2]),
       // periodicity
-      true, true, true);
+      true, true, true,
+      // keep global block information
+      false);
   for (IBlock &block : *m_blocks) {
     m_cached_blocks.push_back(&block);
   }
@@ -73,11 +81,19 @@ LatticeWalberla::LatticeWalberla(Utils::Vector3i const &grid_dimensions,
 [[nodiscard]] std::pair<Utils::Vector3d, Utils::Vector3d>
 LatticeWalberla::get_local_domain() const {
   using walberla::to_vector3d;
-  // We only have one block per mpi rank
-  assert(++(m_blocks->begin()) == m_blocks->end());
-
-  auto const ab = m_blocks->begin()->getAABB();
-  return {to_vector3d(ab.min()), to_vector3d(ab.max())};
+  // Get upper and lower corner of BlockForest assigned to a mpi rank.
+  // Since we can allocate multiple blocks per mpi rank,
+  // the corners of all Blocks are compared.
+  auto aa = to_vector3d(m_blocks->begin()->getAABB().min());
+  auto bb = to_vector3d(m_blocks->begin()->getAABB().max());
+  for (auto b = m_blocks->begin(); b != m_blocks->end(); ++b) {
+    auto cc = b->getAABB();
+    for (auto const i : {0u, 1u, 2u}) {
+      aa[i] = std::min(aa[i], cc.min()[i]);
+      bb[i] = std::max(bb[i], cc.max()[i]);
+    }
+  }
+  return {aa, bb};
 }
 
 [[nodiscard]] bool
diff --git a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
index 3c1b11219f..34d33d25f6 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
@@ -63,6 +63,7 @@
 #include <walberla_bridge/lattice_boltzmann/LeesEdwardsPack.hpp>
 
 #include <utils/Vector.hpp>
+#include <utils/index.hpp>
 #include <utils/interpolation/bspline_3d.hpp>
 #include <utils/math/make_lin_space.hpp>
 
@@ -378,6 +379,7 @@ class LBWalberlaImpl : public LBWalberlaBase {
   std::shared_ptr<gpu::HostFieldAllocator<FloatType>> m_host_field_allocator;
 #endif
 
+  // Interval within not global but mpi rank
   [[nodiscard]] std::optional<CellInterval>
   get_interval(Utils::Vector3i const &lower_corner,
                Utils::Vector3i const &upper_corner) const {
@@ -389,8 +391,46 @@ class LBWalberlaImpl : public LBWalberlaBase {
     if (not lower_bc or not upper_bc) {
       return std::nullopt;
     }
-    assert(&(*(lower_bc->block)) == &(*(upper_bc->block)));
-    return {CellInterval(lower_bc->cell, upper_bc->cell)};
+    auto const global_lower_cell = lower_bc->cell;
+    auto const global_upper_cell = Cell(
+        upper_bc->cell[0] +
+            static_cast<int>(std::round(upper_bc->block->getAABB().min()[0] -
+                                        lower_bc->block->getAABB().min()[0])),
+        upper_bc->cell[1] +
+            static_cast<int>(std::round(upper_bc->block->getAABB().min()[1] -
+                                        lower_bc->block->getAABB().min()[1])),
+        upper_bc->cell[2] +
+            static_cast<int>(std::round(upper_bc->block->getAABB().min()[2] -
+                                        lower_bc->block->getAABB().min()[2])));
+    return CellInterval(global_lower_cell, global_upper_cell);
+  }
+
+  // Interval within local block
+  [[nodiscard]] std::optional<CellInterval> get_block_interval(
+      Utils::Vector3i const &lower_corner, Utils::Vector3i const &upper_corner,
+      Utils::Vector3i const &block_offset, IBlock const &block) const {
+    auto block_lower_corner = to_vector3i(block.getAABB().min());
+    if (not(upper_corner > block_lower_corner)) {
+      return std::nullopt;
+    }
+    for (uint_t f = 0u; f < 3u; ++f) {
+      block_lower_corner[f] = std::max(block_lower_corner[f], lower_corner[f]);
+    }
+    auto block_upper_corner = to_vector3i(block.getAABB().max());
+    if (not(block_upper_corner > lower_corner)) {
+      return std::nullopt;
+    }
+    for (uint_t f = 0u; f < 3u; ++f) {
+      block_upper_corner[f] = std::min(block_upper_corner[f], upper_corner[f]);
+    }
+    block_upper_corner -= Utils::Vector3i::broadcast(1);
+    auto const block_lower_cell = Cell(block_lower_corner[0] - block_offset[0],
+                                       block_lower_corner[1] - block_offset[1],
+                                       block_lower_corner[2] - block_offset[2]);
+    auto const block_upper_cell = Cell(block_upper_corner[0] - block_offset[0],
+                                       block_upper_corner[1] - block_offset[1],
+                                       block_upper_corner[2] - block_offset[2]);
+    return CellInterval(block_lower_cell, block_upper_cell);
   }
 
   /**
@@ -614,6 +654,7 @@ class LBWalberlaImpl : public LBWalberlaBase {
     if (m_has_boundaries) {
       integrate_boundaries(blocks);
     }
+
     // LB stream
     integrate_stream(blocks);
     // Mark pending ghost layer updates
@@ -772,6 +813,11 @@ class LBWalberlaImpl : public LBWalberlaBase {
     auto const &lattice = get_lattice();
     auto const n_ghost_layers = lattice.get_ghost_layers();
     auto const blocks = lattice.get_blocks();
+    if ((shear_direction == 0u and blocks->getXSize() != 1u) or
+        (shear_direction == 2u and blocks->getZSize() != 1u)) {
+      throw std::domain_error("Lees-Edwards LB doesn't support domain "
+                              "decomposition along the shear direction.");
+    }
     auto const agrid =
         FloatType_c(lattice.get_grid_dimensions()[shear_plane_normal]);
     auto obj = CollisionModelLeesEdwards(
@@ -864,44 +910,83 @@ class LBWalberlaImpl : public LBWalberlaBase {
     return true;
   }
 
+  template <typename F>
+  void mapping_block_to_local(std::optional<CellInterval> const &bci,
+                              std::optional<CellInterval> const &ci,
+                              Utils::Vector3i const &block_offset,
+                              Utils::Vector3i const &lower_corner,
+                              F &&func) const {
+    auto const local_grid = Utils::Vector3i{
+        {ci->max().x() - ci->min().x() + 1, ci->max().y() - ci->min().y() + 1,
+         ci->max().z() - ci->min().z() + 1}};
+    auto const block_grid =
+        Utils::Vector3i{{bci->max().x() - bci->min().x() + 1,
+                         bci->max().y() - bci->min().y() + 1,
+                         bci->max().z() - bci->min().z() + 1}};
+    auto const lower_cell = bci->min();
+    auto const upper_cell = bci->max();
+    // In the loop, x,y,z are in block coordinates
+    // The field data given in the argument knows about BlockForest
+    // (lattice) indices from lower_corner to upper_corneri. It is converted
+    // to block coordinates
+    for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
+      for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
+        for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
+          auto const node = block_offset + Utils::Vector3i{{x, y, z}};
+          auto const local_index = Utils::get_linear_index(
+              node[0] - lower_corner[0], node[1] - lower_corner[1],
+              node[2] - lower_corner[2], local_grid,
+              Utils::MemoryOrder::ROW_MAJOR);
+          auto const block_index = Utils::get_linear_index(
+              x - lower_cell.x(), y - lower_cell.y(), z - lower_cell.z(),
+              block_grid, Utils::MemoryOrder::ROW_MAJOR);
+          func(block_index, local_index, node);
+        }
+      }
+    }
+  }
+
   std::vector<double>
   get_slice_velocity(Utils::Vector3i const &lower_corner,
                      Utils::Vector3i const &upper_corner) const override {
     std::vector<double> out;
+    uint_t values_size = 0;
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
+      out = std::vector<double>(static_cast<unsigned int>(3u * ci->numCells()));
       auto const &lattice = get_lattice();
-      auto const &block = *(lattice.get_blocks()->begin());
-      auto const field =
-          block.template getData<VectorField>(m_velocity_field_id);
-      auto const values = lbm::accessor::Vector::get(field, *ci);
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      assert(values.size() == 3u * ci->numCells());
-      if constexpr (std::is_same_v<typename decltype(values)::value_type,
-                                   double>) {
-        out = std::move(values);
-      } else {
-        out = std::vector<double>(values.begin(), values.end());
-      }
-      auto const local_offset = std::get<0>(lattice.get_local_grid_range());
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto it = out.begin();
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            auto const node = local_offset + Utils::Vector3i{{x, y, z}};
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto const &block = *b;
+        auto const block_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                block_offset, block)) {
+          auto const field =
+              block.template getData<VectorField>(m_velocity_field_id);
+          auto const values = lbm::accessor::Vector::get(field, *bci);
+          assert(values.size() == 3u * bci->numCells());
+          values_size += 3u * bci->numCells();
+
+          auto func = [&values, &out, this](uint_t block_index,
+                                            uint_t local_index,
+                                            Utils::Vector3i node) {
             if (m_boundary->node_is_boundary(node)) {
               auto const &vec = m_boundary->get_node_value_at_boundary(node);
               for (uint_t f = 0u; f < 3u; ++f) {
-                (*it) = double_c(vec[f]);
-                std::advance(it, 1l);
+                out[static_cast<unsigned int>(3u * local_index + f)] =
+                    double_c(vec[f]);
               }
             } else {
-              std::advance(it, 3l);
+              for (uint_t f = 0u; f < 3u; ++f) {
+                out[static_cast<unsigned int>(3u * local_index + f)] = double_c(
+                    values[static_cast<unsigned int>(3u * block_index + f)]);
+              }
             }
-          }
+          };
+
+          mapping_block_to_local(bci, ci, block_offset, lower_corner, func);
         }
       }
+      assert(values_size == 3u * ci->numCells());
     }
     return out;
   }
@@ -912,17 +997,37 @@ class LBWalberlaImpl : public LBWalberlaBase {
     m_pending_ghost_comm.set(GhostComm::PDF);
     m_pending_ghost_comm.set(GhostComm::VEL);
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
-      auto const &lattice = get_lattice();
-      auto &block = *(lattice.get_blocks()->begin());
-      auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto force_field =
-          block.template getData<VectorField>(m_last_applied_force_field_id);
-      auto vel_field = block.template getData<VectorField>(m_velocity_field_id);
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
       assert(velocity.size() == 3u * ci->numCells());
-      std::vector<FloatType> const values(velocity.begin(), velocity.end());
-      lbm::accessor::Velocity::set(pdf_field, vel_field, force_field, values,
-                                   *ci);
+      auto const &lattice = get_lattice();
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto &block = *b;
+        auto const block_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                block_offset, block)) {
+          auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
+          auto force_field = block.template getData<VectorField>(
+              m_last_applied_force_field_id);
+          auto vel_field =
+              block.template getData<VectorField>(m_velocity_field_id);
+          std::vector<FloatType> values = std::vector<FloatType>(
+              static_cast<unsigned int>(3u * bci->numCells()));
+
+          auto func = [&values, &velocity](uint_t block_index,
+                                           uint_t local_index,
+                                           Utils::Vector3i node) {
+            for (uint_t f = 0u; f < 3u; ++f) {
+              values[static_cast<unsigned int>(3u * block_index + f)] =
+                  numeric_cast<FloatType>(velocity[static_cast<unsigned int>(
+                      3u * local_index + f)]);
+            }
+          };
+
+          mapping_block_to_local(bci, ci, block_offset, lower_corner, func);
+          lbm::accessor::Velocity::set(pdf_field, vel_field, force_field,
+                                       values, *bci);
+        }
+      }
     }
   }
 
@@ -1073,8 +1178,11 @@ class LBWalberlaImpl : public LBWalberlaBase {
       return false;
     auto const force_at_node = [this, &force](std::array<int, 3> const node,
                                               double weight) {
-      auto const bc =
-          get_block_and_cell(get_lattice(), Utils::Vector3i(node), true);
+      auto bc = get_block_and_cell(get_lattice(), Utils::Vector3i(node), false);
+      if (!bc) {
+        bc = get_block_and_cell(get_lattice(), Utils::Vector3i(node), true);
+      }
+
       if (bc) {
         auto const weighted_force = to_vector3<FloatType>(weight * force);
         auto force_field =
@@ -1137,18 +1245,29 @@ class LBWalberlaImpl : public LBWalberlaBase {
       Utils::Vector3i const &upper_corner) const override {
     std::vector<double> out;
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
+      out = std::vector<double>(static_cast<unsigned int>(3u * ci->numCells()));
       auto const &lattice = get_lattice();
-      auto const &block = *(lattice.get_blocks()->begin());
-      auto const field =
-          block.template getData<VectorField>(m_last_applied_force_field_id);
-      auto const values = lbm::accessor::Vector::get(field, *ci);
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      assert(values.size() == 3u * ci->numCells());
-      if constexpr (std::is_same_v<typename decltype(values)::value_type,
-                                   double>) {
-        out = std::move(values);
-      } else {
-        out = std::vector<double>(values.begin(), values.end());
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto const &block = *b;
+        auto const block_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                block_offset, block)) {
+          auto const field = block.template getData<VectorField>(
+              m_last_applied_force_field_id);
+          auto const values = lbm::accessor::Vector::get(field, *bci);
+          assert(values.size() == 3u * bci->numCells());
+
+          auto func = [&values, &out](uint_t block_index, uint_t local_index,
+                                      Utils::Vector3i node) {
+            for (uint_t f = 0u; f < 3u; ++f) {
+              out[static_cast<unsigned int>(3u * local_index + f)] =
+                  values[static_cast<unsigned int>(3u * block_index + f)];
+            }
+          };
+
+          mapping_block_to_local(bci, ci, block_offset, lower_corner, func);
+        }
       }
     }
     return out;
@@ -1160,16 +1279,36 @@ class LBWalberlaImpl : public LBWalberlaBase {
     m_pending_ghost_comm.set(GhostComm::VEL);
     m_pending_ghost_comm.set(GhostComm::LAF);
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
-      auto const &lattice = get_lattice();
-      auto &block = *(lattice.get_blocks()->begin());
-      auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto force_field =
-          block.template getData<VectorField>(m_last_applied_force_field_id);
-      auto vel_field = block.template getData<VectorField>(m_velocity_field_id);
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
       assert(force.size() == 3u * ci->numCells());
-      std::vector<FloatType> const values(force.begin(), force.end());
-      lbm::accessor::Force::set(pdf_field, vel_field, force_field, values, *ci);
+      auto const &lattice = get_lattice();
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto &block = *b;
+        auto const block_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                block_offset, block)) {
+          auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
+          auto force_field = block.template getData<VectorField>(
+              m_last_applied_force_field_id);
+          auto vel_field =
+              block.template getData<VectorField>(m_velocity_field_id);
+          std::vector<FloatType> values = std::vector<FloatType>(
+              static_cast<unsigned int>(3u * bci->numCells()));
+
+          auto func = [&values, &force](uint_t block_index, uint_t local_index,
+                                        Utils::Vector3i node) {
+            for (uint_t f = 0u; f < 3u; ++f) {
+              values[static_cast<unsigned int>(3u * block_index + f)] =
+                  numeric_cast<FloatType>(
+                      force[static_cast<unsigned int>(3u * local_index + f)]);
+            }
+          };
+
+          mapping_block_to_local(bci, ci, block_offset, lower_corner, func);
+          lbm::accessor::Force::set(pdf_field, vel_field, force_field, values,
+                                    *bci);
+        }
+      }
     }
   }
 
@@ -1220,17 +1359,32 @@ class LBWalberlaImpl : public LBWalberlaBase {
                        Utils::Vector3i const &upper_corner) const override {
     std::vector<double> out;
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
+      out = std::vector<double>(
+          static_cast<unsigned int>(stencil_size() * ci->numCells()));
       auto const &lattice = get_lattice();
-      auto const &block = *(lattice.get_blocks()->begin());
-      auto const pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto const values = lbm::accessor::Population::get(pdf_field, *ci);
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      assert(values.size() == stencil_size() * ci->numCells());
-      if constexpr (std::is_same_v<typename decltype(values)::value_type,
-                                   double>) {
-        out = std::move(values);
-      } else {
-        out = std::vector<double>(values.begin(), values.end());
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto const &block = *b;
+        auto const block_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                block_offset, block)) {
+          auto const pdf_field =
+              block.template getData<PdfField>(m_pdf_field_id);
+          auto const values = lbm::accessor::Population::get(pdf_field, *bci);
+          assert(values.size() == stencil_size() * bci->numCells());
+
+          auto func = [&values, &out, this](uint_t block_index,
+                                            uint_t local_index,
+                                            Utils::Vector3i node) {
+            for (uint_t f = 0u; f < stencil_size(); ++f) {
+              out[static_cast<unsigned int>(stencil_size() * local_index + f)] =
+                  values[static_cast<unsigned int>(
+                      stencil_size() * block_index + f)];
+            }
+          };
+
+          mapping_block_to_local(bci, ci, block_offset, lower_corner, func);
+        }
       }
     }
     return out;
@@ -1240,17 +1394,38 @@ class LBWalberlaImpl : public LBWalberlaBase {
                             Utils::Vector3i const &upper_corner,
                             std::vector<double> const &population) override {
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
-      auto const &lattice = get_lattice();
-      auto &block = *(lattice.get_blocks()->begin());
-      auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto force_field =
-          block.template getData<VectorField>(m_last_applied_force_field_id);
-      auto vel_field = block.template getData<VectorField>(m_velocity_field_id);
       assert(population.size() == stencil_size() * ci->numCells());
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      std::vector<FloatType> const values(population.begin(), population.end());
-      lbm::accessor::Population::set(pdf_field, vel_field, force_field, values,
-                                     *ci);
+      auto const &lattice = get_lattice();
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto &block = *b;
+        auto const block_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                block_offset, block)) {
+          auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
+          auto force_field = block.template getData<VectorField>(
+              m_last_applied_force_field_id);
+          auto vel_field =
+              block.template getData<VectorField>(m_velocity_field_id);
+          std::vector<FloatType> values = std::vector<FloatType>(
+              static_cast<unsigned int>(stencil_size() * bci->numCells()));
+
+          auto func = [&values, &population, this](uint_t block_index,
+                                                   uint_t local_index,
+                                                   Utils::Vector3i node) {
+            for (uint_t f = 0u; f < stencil_size(); ++f) {
+              values[static_cast<unsigned int>(stencil_size() * block_index +
+                                               f)] =
+                  numeric_cast<FloatType>(population[static_cast<unsigned int>(
+                      stencil_size() * local_index + f)]);
+            }
+          };
+
+          mapping_block_to_local(bci, ci, block_offset, lower_corner, func);
+          lbm::accessor::Population::set(pdf_field, vel_field, force_field,
+                                         values, *bci);
+        }
+      }
     }
   }
 
@@ -1286,17 +1461,26 @@ class LBWalberlaImpl : public LBWalberlaBase {
                     Utils::Vector3i const &upper_corner) const override {
     std::vector<double> out;
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
+      out = std::vector<double>(ci->numCells());
       auto const &lattice = get_lattice();
-      auto const &block = *(lattice.get_blocks()->begin());
-      auto const pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto const values = lbm::accessor::Density::get(pdf_field, *ci);
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      assert(values.size() == ci->numCells());
-      if constexpr (std::is_same_v<typename decltype(values)::value_type,
-                                   double>) {
-        out = std::move(values);
-      } else {
-        out = std::vector<double>(values.begin(), values.end());
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto const &block = *b;
+        auto const block_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                block_offset, block)) {
+          auto const pdf_field =
+              block.template getData<PdfField>(m_pdf_field_id);
+          auto const values = lbm::accessor::Density::get(pdf_field, *bci);
+          assert(values.size() == bci->numCells());
+
+          auto func = [&values, &out](uint_t block_index, uint_t local_index,
+                                      Utils::Vector3i node) {
+            out[local_index] = values[block_index];
+          };
+
+          mapping_block_to_local(bci, ci, block_offset, lower_corner, func);
+        }
       }
     }
     return out;
@@ -1307,13 +1491,28 @@ class LBWalberlaImpl : public LBWalberlaBase {
                          std::vector<double> const &density) override {
     m_pending_ghost_comm.set(GhostComm::PDF);
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
-      auto const &lattice = get_lattice();
-      auto &block = *(lattice.get_blocks()->begin());
-      auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
       assert(density.size() == ci->numCells());
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      std::vector<FloatType> const values(density.begin(), density.end());
-      lbm::accessor::Density::set(pdf_field, values, *ci);
+      auto const &lattice = get_lattice();
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto &block = *b;
+        auto const block_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                block_offset, block)) {
+          auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
+          std::vector<FloatType> values =
+              std::vector<FloatType>(bci->numCells());
+
+          auto func = [&values, &density](uint_t block_index,
+                                          uint_t local_index,
+                                          Utils::Vector3i node) {
+            values[block_index] = numeric_cast<FloatType>(density[local_index]);
+          };
+
+          mapping_block_to_local(bci, ci, block_offset, lower_corner, func);
+          lbm::accessor::Density::set(pdf_field, values, *bci);
+        }
+      }
     }
   }
 
@@ -1332,7 +1531,10 @@ class LBWalberlaImpl : public LBWalberlaBase {
                                      Utils::Vector3d const &velocity) override {
     on_boundary_add();
     m_pending_ghost_comm.set(GhostComm::UBB);
-    auto bc = get_block_and_cell(get_lattice(), node, true);
+    auto bc = get_block_and_cell(get_lattice(), node, false);
+    if (!bc) {
+      bc = get_block_and_cell(get_lattice(), node, true);
+    }
     if (bc) {
       m_boundary->set_node_value_at_boundary(
           node, to_vector3<FloatType>(velocity), *bc);
@@ -1345,26 +1547,29 @@ class LBWalberlaImpl : public LBWalberlaBase {
       Utils::Vector3i const &upper_corner) const override {
     std::vector<std::optional<Utils::Vector3d>> out;
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
+      out = std::vector<std::optional<Utils::Vector3d>>(ci->numCells());
       auto const &lattice = get_lattice();
-      auto const local_offset = std::get<0>(lattice.get_local_grid_range());
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto const n_values = ci->numCells();
-      out.reserve(n_values);
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            auto const node = local_offset + Utils::Vector3i{{x, y, z}};
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto const &block = *b;
+        auto const block_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                block_offset, block)) {
+
+          auto func = [&out, this](uint_t block_index, uint_t local_index,
+                                   Utils::Vector3i node) {
             if (m_boundary->node_is_boundary(node)) {
-              out.emplace_back(
-                  to_vector3d(m_boundary->get_node_value_at_boundary(node)));
+              out[local_index] =
+                  to_vector3d(m_boundary->get_node_value_at_boundary(node));
             } else {
-              out.emplace_back(std::nullopt);
+              out[local_index] = std::nullopt;
             }
-          }
+          };
+
+          mapping_block_to_local(bci, ci, block_offset, lower_corner, func);
         }
       }
-      assert(out.size() == n_values);
+      assert(out.size() == ci->numCells());
     }
     return out;
   }
@@ -1375,26 +1580,30 @@ class LBWalberlaImpl : public LBWalberlaBase {
     on_boundary_add();
     m_pending_ghost_comm.set(GhostComm::UBB);
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
-      auto const &lattice = get_lattice();
-      auto const local_offset = std::get<0>(lattice.get_local_grid_range());
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto it = velocity.begin();
       assert(velocity.size() == ci->numCells());
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            auto const node = local_offset + Utils::Vector3i{{x, y, z}};
+      auto const &lattice = get_lattice();
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto &block = *b;
+        auto const block_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                block_offset, block)) {
+
+          auto func = [&lattice, &block, &velocity,
+                       this](uint_t block_index, uint_t local_index,
+                             Utils::Vector3i node) {
             auto const bc = get_block_and_cell(lattice, node, false);
-            auto const &opt = *it;
+            assert(bc->block->getAABB() == block.getAABB());
+            auto const &opt = velocity[local_index];
             if (opt) {
               m_boundary->set_node_value_at_boundary(
                   node, to_vector3<FloatType>(*opt), *bc);
             } else {
               m_boundary->remove_node_from_boundary(node, *bc);
             }
-            ++it;
-          }
+          };
+
+          mapping_block_to_local(bci, ci, block_offset, lower_corner, func);
         }
       }
     }
@@ -1410,7 +1619,10 @@ class LBWalberlaImpl : public LBWalberlaBase {
   }
 
   bool remove_node_from_boundary(Utils::Vector3i const &node) override {
-    auto bc = get_block_and_cell(get_lattice(), node, true);
+    auto bc = get_block_and_cell(get_lattice(), node, false);
+    if (!bc) {
+      bc = get_block_and_cell(get_lattice(), node, true);
+    }
     if (bc) {
       m_boundary->remove_node_from_boundary(node, *bc);
     }
@@ -1433,21 +1645,24 @@ class LBWalberlaImpl : public LBWalberlaBase {
                         Utils::Vector3i const &upper_corner) const override {
     std::vector<bool> out;
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
+      out = std::vector<bool>(ci->numCells());
       auto const &lattice = get_lattice();
-      auto const local_offset = std::get<0>(lattice.get_local_grid_range());
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto const n_values = ci->numCells();
-      out.reserve(n_values);
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            auto const node = local_offset + Utils::Vector3i{x, y, z};
-            out.emplace_back(m_boundary->node_is_boundary(node));
-          }
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto const &block = *b;
+        auto const block_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                block_offset, block)) {
+
+          auto func = [&out, this](uint_t block_index, uint_t local_index,
+                                   Utils::Vector3i node) {
+            out[local_index] = m_boundary->node_is_boundary(node);
+          };
+
+          mapping_block_to_local(bci, ci, block_offset, lower_corner, func);
         }
       }
-      assert(out.size() == n_values);
+      assert(out.size() == ci->numCells());
     }
     return out;
   }
@@ -1501,20 +1716,32 @@ class LBWalberlaImpl : public LBWalberlaBase {
       Utils::Vector3i const &upper_corner) const override {
     std::vector<double> out;
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
+      out = std::vector<double>(static_cast<unsigned int>(9u * ci->numCells()));
       auto const &lattice = get_lattice();
-      auto const &block = *(lattice.get_blocks()->begin());
-      auto const pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto values = lbm::accessor::PressureTensor::get(pdf_field, *ci);
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      assert(values.size() == 9u * ci->numCells());
-      for (auto it = values.begin(); it != values.end(); std::advance(it, 9l)) {
-        pressure_tensor_correction(std::span<FloatType, 9ul>(it, 9ul));
-      }
-      if constexpr (std::is_same_v<typename decltype(values)::value_type,
-                                   double>) {
-        out = std::move(values);
-      } else {
-        out = std::vector<double>(values.begin(), values.end());
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto const &block = *b;
+        auto const block_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                block_offset, block)) {
+          auto const pdf_field =
+              block.template getData<PdfField>(m_pdf_field_id);
+          auto values = lbm::accessor::PressureTensor::get(pdf_field, *bci);
+          assert(values.size() == 9u * bci->numCells());
+
+          auto func = [&values, &out, this](uint_t block_index,
+                                            uint_t local_index,
+                                            Utils::Vector3i node) {
+            pressure_tensor_correction(std::span<FloatType, 9ul>(
+                &values[static_cast<unsigned int>(9u * block_index)], 9ul));
+            for (uint_t f = 0u; f < 9u; ++f) {
+              out[static_cast<unsigned int>(9u * local_index + f)] =
+                  values[static_cast<unsigned int>(9u * block_index + f)];
+            }
+          };
+
+          mapping_block_to_local(bci, ci, block_offset, lower_corner, func);
+        }
       }
     }
     return out;
diff --git a/src/walberla_bridge/src/utils/boundary.hpp b/src/walberla_bridge/src/utils/boundary.hpp
index 719c028aa4..dbd2a9ab25 100644
--- a/src/walberla_bridge/src/utils/boundary.hpp
+++ b/src/walberla_bridge/src/utils/boundary.hpp
@@ -85,15 +85,15 @@ void set_boundary_from_grid(BoundaryModel &boundary,
 
   auto const &conv = es2walberla<DataType, typename BoundaryModel::value_type>;
   auto const grid_size = lattice.get_grid_dimensions();
-  auto const offset = lattice.get_local_grid_range().first;
   auto const gl = static_cast<int>(lattice.get_ghost_layers());
   assert(raster_flat.size() ==
          static_cast<std::size_t>(Utils::product(grid_size)));
   auto const n_y = static_cast<std::size_t>(grid_size[1]);
   auto const n_z = static_cast<std::size_t>(grid_size[2]);
 
-  for (auto const &block : *lattice.get_blocks()) {
+  for (auto &block : *lattice.get_blocks()) {
     auto const [size_i, size_j, size_k] = boundary.block_dims(block);
+    auto const offset = to_vector3i(block.getAABB().min());
     // Get field data which knows about the indices
     // In the loop, i,j,k are in block-local coordinates
     for (int i = -gl; i < size_i + gl; ++i) {
@@ -106,8 +106,9 @@ void set_boundary_from_grid(BoundaryModel &boundary,
                              static_cast<std::size_t>(idx[2]);
           if (raster_flat[index]) {
             auto const &value = data_flat[index];
-            auto const bc = get_block_and_cell(lattice, node, true);
-            assert(bc.has_value());
+            std::optional<BlockAndCell> bc;
+            bc->block = &block;
+            bc->cell = Cell(i, j, k);
             boundary.set_node_value_at_boundary(node, conv(value), *bc);
           }
         }
diff --git a/src/walberla_bridge/src/utils/types_conversion.hpp b/src/walberla_bridge/src/utils/types_conversion.hpp
index 6f196cb57a..12e89ceb15 100644
--- a/src/walberla_bridge/src/utils/types_conversion.hpp
+++ b/src/walberla_bridge/src/utils/types_conversion.hpp
@@ -68,6 +68,28 @@ inline Utils::VectorXd<9> to_vector9d(Matrix3<float> const &m) {
           double_c(m[3]), double_c(m[4]), double_c(m[5]),
           double_c(m[6]), double_c(m[7]), double_c(m[8])};
 }
+inline Utils::Vector3i to_vector3i(Vector3<float> const &v) {
+#ifndef NDEBUG
+  for (auto const i : {0u, 1u, 2u}) {
+    assert(std::abs(v[i] - static_cast<float>(
+                               static_cast<int>(std::round(v[i])))) < 1e-3);
+  }
+#endif
+  return Utils::Vector3i{{static_cast<int>(std::round(v[0])),
+                          static_cast<int>(std::round(v[1])),
+                          static_cast<int>(std::round(v[2]))}};
+}
+inline Utils::Vector3i to_vector3i(Vector3<double> const &v) {
+#ifndef NDEBUG
+  for (auto const i : {0u, 1u, 2u}) {
+    assert(std::abs(v[i] - double_c(static_cast<int>(std::round(v[i])))) <
+           1e-3);
+  }
+#endif
+  return Utils::Vector3i{{static_cast<int>(std::round(v[0])),
+                          static_cast<int>(std::round(v[1])),
+                          static_cast<int>(std::round(v[2]))}};
+}
 
 template <typename Function>
 void interpolate_bspline_at_pos(Utils::Vector3d const &pos, Function const &f) {
diff --git a/src/walberla_bridge/tests/EKinWalberlaImpl_unit_tests.cpp b/src/walberla_bridge/tests/EKinWalberlaImpl_unit_tests.cpp
index 210b5edb57..3e086d7c63 100644
--- a/src/walberla_bridge/tests/EKinWalberlaImpl_unit_tests.cpp
+++ b/src/walberla_bridge/tests/EKinWalberlaImpl_unit_tests.cpp
@@ -570,8 +570,8 @@ int main(int argc, char **argv) {
   params.ext_efield = Vector3d{0.01, 0.02, 0.03};
   params.grid_dimensions = Vector3i{12, 12, 18};
   params.box_dimensions = Vector3d{12, 12, 18};
-  params.lattice =
-      std::make_shared<LatticeWalberla>(params.grid_dimensions, mpi_shape, 1u);
+  params.lattice = std::make_shared<LatticeWalberla>(params.grid_dimensions,
+                                                     mpi_shape, mpi_shape, 1u);
 
   auto const res = boost::unit_test::unit_test_main(init_unit_test, argc, argv);
   MPI_Finalize();
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_bspline_tests.cpp b/src/walberla_bridge/tests/LBWalberlaImpl_bspline_tests.cpp
index a0123cbe67..3be29c54d1 100644
--- a/src/walberla_bridge/tests/LBWalberlaImpl_bspline_tests.cpp
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_bspline_tests.cpp
@@ -156,8 +156,8 @@ int main(int argc, char **argv) {
   params.density = 1.4;
   params.grid_dimensions = Vector3i{12, 6, 9};
   params.box_dimensions = Vector3d{12, 6, 9};
-  params.lattice =
-      std::make_shared<LatticeWalberla>(params.grid_dimensions, mpi_shape, 1u);
+  params.lattice = std::make_shared<LatticeWalberla>(params.grid_dimensions,
+                                                     mpi_shape, mpi_shape, 1u);
 
   auto const res = boost::unit_test::unit_test_main(init_unit_test, argc, argv);
   MPI_Finalize();
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_field_accessors_tests.cu b/src/walberla_bridge/tests/LBWalberlaImpl_field_accessors_tests.cu
index 30c98ab2e7..f02c76c188 100644
--- a/src/walberla_bridge/tests/LBWalberlaImpl_field_accessors_tests.cu
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_field_accessors_tests.cu
@@ -106,11 +106,13 @@ boost::test_tools::predicate_result almost_equal(R const &val, R const &ref,
   for (auto i = 0ul; i < val.size(); ++i) {
     if (auto const diff = std::abs(val[i] - ref[i]); diff > atol) {
       res = false;
-      res.message() << "val{" << print_first_n(val) << "} and " << "ref{"
-                    << print_first_n(ref) << "} mismatch: " << "val[" << i
-                    << "]{" << val[i] << "} != " << "ref[" << i << "]{"
-                    << ref[i] << "} " << "(difference{" << diff << "} > delta{"
-                    << atol << "})";
+      // clang-format off
+      res.message() << "val{" << print_first_n(val) << "} and "
+                    << "ref{" << print_first_n(ref) << "} mismatch: "
+		    << "val[" << i << "]{" << val[i] << "} != "
+		    << "ref[" << i << "]{" << ref[i] << "} "
+		    << "(difference{" << diff << "} > delta{" << atol << "})";
+      // clang-format on
       break;
     }
   }
@@ -156,7 +158,8 @@ template <typename FT, lbmpy::Arch Architecture> struct Fixture {
     auto const grid_dim = Utils::Vector3i::broadcast(4);
     auto const viscosity = FT(1.5);
     auto const density = FT(0.9);
-    lattice = std::make_shared<::LatticeWalberla>(grid_dim, mpi_shape, 1u);
+    lattice =
+        std::make_shared<::LatticeWalberla>(grid_dim, mpi_shape, mpi_shape, 1u);
     lbfluid = std::make_shared<LBWalberlaImplTest<FT, Architecture>>(
         lattice, viscosity, density);
   }
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_flow_tests.cpp b/src/walberla_bridge/tests/LBWalberlaImpl_flow_tests.cpp
index 36526ee3ce..96049bff27 100644
--- a/src/walberla_bridge/tests/LBWalberlaImpl_flow_tests.cpp
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_flow_tests.cpp
@@ -167,8 +167,8 @@ int main(int argc, char **argv) {
   params.density = 1.4;
   params.grid_dimensions = Vector3i{12, 12, 18};
   params.box_dimensions = Vector3d{6, 6, 9};
-  params.lattice =
-      std::make_shared<LatticeWalberla>(params.grid_dimensions, mpi_shape, 1u);
+  params.lattice = std::make_shared<LatticeWalberla>(params.grid_dimensions,
+                                                     mpi_shape, mpi_shape, 1u);
 
   auto const res = boost::unit_test::unit_test_main(init_unit_test, argc, argv);
   MPI_Finalize();
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards_tests.cpp b/src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards_tests.cpp
index 8e66ed037e..44667b4fa0 100644
--- a/src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards_tests.cpp
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards_tests.cpp
@@ -71,8 +71,8 @@ BOOST_AUTO_TEST_CASE(test_transient_shear) {
   using LBImplementation = walberla::LBWalberlaImpl<double, lbmpy::Arch::CPU>;
   double density = 1;
   double viscosity = 1. / 7.;
-  auto lattice =
-      std::make_shared<LatticeWalberla>(Vector3i{8, 64, 8}, mpi_shape, 1);
+  auto lattice = std::make_shared<LatticeWalberla>(Vector3i{8, 64, 8},
+                                                   mpi_shape, mpi_shape, 1);
   auto lb = LBImplementation(lattice, viscosity, density);
   auto le_pack = std::make_unique<LeesEdwardsPack>(
       0u, 1u, []() { return 0.0; }, [=]() { return v0; });
@@ -96,8 +96,8 @@ static auto setup_lb_with_offset(double offset) {
   using LBImplementation = walberla::LBWalberlaImpl<double, lbmpy::Arch::CPU>;
   auto density = 1.;
   auto viscosity = 1. / 7.;
-  auto lattice =
-      std::make_shared<LatticeWalberla>(Vector3i{10, 10, 10}, mpi_shape, 1);
+  auto lattice = std::make_shared<LatticeWalberla>(Vector3i{10, 10, 10},
+                                                   mpi_shape, mpi_shape, 1);
   auto lb = std::make_shared<LBImplementation>(lattice, viscosity, density);
   auto le_pack = std::make_unique<LeesEdwardsPack>(
       0u, 1u, [=]() { return offset; }, []() { return 0.0; });
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_statistical_tests.cpp b/src/walberla_bridge/tests/LBWalberlaImpl_statistical_tests.cpp
index 9732bc8a71..30e4b4b695 100644
--- a/src/walberla_bridge/tests/LBWalberlaImpl_statistical_tests.cpp
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_statistical_tests.cpp
@@ -132,8 +132,8 @@ int main(int argc, char **argv) {
   params.density = 1.4;
   params.grid_dimensions = Vector3i{12, 12, 18};
   params.box_dimensions = Vector3d{6, 6, 9};
-  params.lattice =
-      std::make_shared<LatticeWalberla>(params.grid_dimensions, mpi_shape, 1u);
+  params.lattice = std::make_shared<LatticeWalberla>(params.grid_dimensions,
+                                                     mpi_shape, mpi_shape, 1u);
 
   auto const res = boost::unit_test::unit_test_main(init_unit_test, argc, argv);
   MPI_Finalize();
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_unit_tests.cpp b/src/walberla_bridge/tests/LBWalberlaImpl_unit_tests.cpp
index c3352fcbed..c473a4fc78 100644
--- a/src/walberla_bridge/tests/LBWalberlaImpl_unit_tests.cpp
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_unit_tests.cpp
@@ -587,8 +587,8 @@ BOOST_DATA_TEST_CASE(vtk_exceptions,
 
 BOOST_AUTO_TEST_CASE(lb_exceptions) {
   using LB = walberla::LBWalberlaImpl<double, lbmpy::Arch::CPU>;
-  auto lb_lattice_without_ghosts =
-      std::make_shared<LatticeWalberla>(params.grid_dimensions, mpi_shape, 0u);
+  auto lb_lattice_without_ghosts = std::make_shared<LatticeWalberla>(
+      params.grid_dimensions, mpi_shape, mpi_shape, 0u);
   BOOST_CHECK_THROW(LB(lb_lattice_without_ghosts, 1., 1.), std::runtime_error);
 }
 
@@ -630,8 +630,8 @@ int main(int argc, char **argv) {
   params.density = 1.4;
   params.grid_dimensions = Vector3i{12, 12, 18};
   params.box_dimensions = Vector3d{12, 12, 18};
-  params.lattice =
-      std::make_shared<LatticeWalberla>(params.grid_dimensions, mpi_shape, 1u);
+  params.lattice = std::make_shared<LatticeWalberla>(params.grid_dimensions,
+                                                     mpi_shape, mpi_shape, 1u);
 
   auto const res = boost::unit_test::unit_test_main(init_unit_test, argc, argv);
   MPI_Finalize();
diff --git a/src/walberla_bridge/tests/LatticeWalberla_unit_tests.cpp b/src/walberla_bridge/tests/LatticeWalberla_unit_tests.cpp
index 977586ad89..8385981e93 100644
--- a/src/walberla_bridge/tests/LatticeWalberla_unit_tests.cpp
+++ b/src/walberla_bridge/tests/LatticeWalberla_unit_tests.cpp
@@ -52,8 +52,8 @@ static LatticeTestParameters params; // populated in main()
 static Vector3i mpi_shape;           // populated in main
 
 BOOST_DATA_TEST_CASE(domain_and_halo, bdata::xrange(3u), n_ghost_layers) {
-  auto const lattice =
-      LatticeWalberla(params.grid_dimensions, mpi_shape, n_ghost_layers);
+  auto const lattice = LatticeWalberla(params.grid_dimensions, mpi_shape,
+                                       mpi_shape, n_ghost_layers);
   auto const [my_left, my_right] = lattice.get_local_domain();
 
   for (auto const &n : all_nodes_incl_ghosts(lattice)) {
@@ -104,7 +104,7 @@ BOOST_AUTO_TEST_CASE(exceptions) {
     auto grid_dims = Vector3i::broadcast(1);
     grid_dims[i] = 3;
     node_grid[i] = 2;
-    BOOST_CHECK_THROW(LatticeWalberla(grid_dims, node_grid, 1u),
+    BOOST_CHECK_THROW(LatticeWalberla(grid_dims, node_grid, node_grid, 1u),
                       std::runtime_error);
   }
 }
diff --git a/testsuite/python/lb.py b/testsuite/python/lb.py
index a16ff6c5a3..c062d37a55 100644
--- a/testsuite/python/lb.py
+++ b/testsuite/python/lb.py
@@ -517,7 +517,9 @@ def test_agrid_rounding(self):
         phi = 0.05
         lj_sig = 1.0
         l = (n_part * 4. / 3. * np.pi * (lj_sig / 2.)**3 / phi)**(1. / 3.)
-        system.box_l = [l] * 3 * np.array(system.cell_system.node_grid)
+        system.box_l = l * np.array(system.cell_system.node_grid)
+        if hasattr(self, 'blocks_per_mpi_rank'):
+            system.box_l = system.box_l * np.array(self.blocks_per_mpi_rank)
         lbf = self.lb_class(agrid=l / 31, density=1, kinematic_viscosity=1, kT=0,
                             tau=system.time_step, **self.lb_params)
         system.lb = lbf
@@ -825,6 +827,25 @@ def params_with_tau(tau):
         np.testing.assert_allclose(v1, v2, rtol=1e-2)
         np.testing.assert_allclose(f1, f2, rtol=1e-2)
 
+    def test_raise_block_grid_mismatch(self):
+        if not hasattr(self, 'blocks_per_mpi_rank'):
+            self.skipTest(
+                "Skipping test: this test is only for the systme allocating multiple blocks to one mpi rank")
+        with self.assertRaisesRegex(RuntimeError, "Lattice grid dimensions and block grid are not compatible"):
+            self.lb_class(
+                **self.params, single_precision=self.lb_params["single_precision"], blocks_per_mpi_rank=[11, 1, 1])
+
+    @utx.skipIfMissingGPU()
+    def test_raise_blocks_for_GPU(self):
+        if self.lb_class != espressomd.lb.LBFluidWalberlaGPU:
+            self.skipTest(
+                "Skipping test: this test is only for LBFluidWalberlaGPU")
+        blocks_per_mpi_rank = [2, 2, 2]
+        self.lb_params = {"single_precision": False,
+                          "blocks_per_mpi_rank": blocks_per_mpi_rank}
+        with self.assertRaisesRegex(RuntimeError, "Using more than one block per MPI rank is not supported for GPU LB"):
+            self.lb_class(**self.params, **self.lb_params)
+
 
 @utx.skipIfMissingFeatures("WALBERLA")
 class LBTestWalberlaDoublePrecisionCPU(LBTest, ut.TestCase):
@@ -864,5 +885,27 @@ class LBTestWalberlaSinglePrecisionGPU(LBTest, ut.TestCase):
     rtol = 2e-4
 
 
+@utx.skipIfMissingFeatures("WALBERLA")
+class LBTestWalberlaDoublePrecisionBlocksCPU(LBTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_lattice_class = espressomd.lb.LatticeWalberla
+    blocks_per_mpi_rank = [2, 2, 2]
+    lb_params = {"single_precision": False,
+                 "blocks_per_mpi_rank": blocks_per_mpi_rank}
+    atol = 1e-10
+    rtol = 1e-7
+
+
+@utx.skipIfMissingFeatures("WALBERLA")
+class LBTestWalberlaSinglePrecisionBlocksCPU(LBTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_lattice_class = espressomd.lb.LatticeWalberla
+    blocks_per_mpi_rank = [2, 2, 2]
+    lb_params = {"single_precision": True,
+                 "blocks_per_mpi_rank": blocks_per_mpi_rank}
+    atol = 1e-6
+    rtol = 2e-4
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_boundary.py b/testsuite/python/lb_boundary.py
index 6ad5a6c0ad..7d46007335 100644
--- a/testsuite/python/lb_boundary.py
+++ b/testsuite/python/lb_boundary.py
@@ -125,5 +125,11 @@ class LBBoundariesWalberlaSinglePrecisionGPU(LBBoundariesBase, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBBoundariesWalberlaDoublePrecisionCPU(LBBoundariesBase, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 1, 1]}
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_boundary_ghost_layer.py b/testsuite/python/lb_boundary_ghost_layer.py
index 84ce9180f0..46bcb36d3f 100644
--- a/testsuite/python/lb_boundary_ghost_layer.py
+++ b/testsuite/python/lb_boundary_ghost_layer.py
@@ -117,5 +117,12 @@ class LBPoiseuilleWalberlaDoublePrecisionGPU(TestCommon, ut.TestCase):
     lb_params = {"single_precision": False}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+# @ut.skipIf(TestCommon.n_nodes != 2, "only runs for 2 MPI ranks")
+class LBPoiseuilleWalberlaDoublePrecisionBlocksCPU(TestCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 1, 1]}
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_boundary_volume_force.py b/testsuite/python/lb_boundary_volume_force.py
index 76beda388f..9f402839ba 100644
--- a/testsuite/python/lb_boundary_volume_force.py
+++ b/testsuite/python/lb_boundary_volume_force.py
@@ -111,5 +111,11 @@ class LBBoundaryForceWalberlaSinglePrecision(
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBBoundaryForceWalberlaBlocks(LBBoundaryForceCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
 if __name__ == '__main__':
     ut.main()
diff --git a/testsuite/python/lb_circular_couette.py b/testsuite/python/lb_circular_couette.py
index f16b6bbf24..2c9b1a1ad7 100644
--- a/testsuite/python/lb_circular_couette.py
+++ b/testsuite/python/lb_circular_couette.py
@@ -172,5 +172,17 @@ class LBCircularCouetteWalberlaSinglePrecisionGPU(LBCouetteTest, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBCircularCouetteWalberlaDoublePRecisionBlocksCPU(LBCouetteTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBCircularCouetteWalberlaSinglePRecisionBlocksCPU(LBCouetteTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_couette_xy.py b/testsuite/python/lb_couette_xy.py
new file mode 100644
index 0000000000..930de14297
--- /dev/null
+++ b/testsuite/python/lb_couette_xy.py
@@ -0,0 +1,145 @@
+#
+# Copyright (C) 2021-2025 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import espressomd.lb
+import espressomd.lees_edwards
+
+import unittest as ut
+import unittest_decorators as utx
+import numpy as np
+
+
+LB_PARAMS = {'agrid': 1.,
+             'density': 1.,
+             'kinematic_viscosity': 1. / 6.,
+             'tau': 1.}
+
+system = espressomd.System(box_l=[32, 32, 32])
+system.time_step = LB_PARAMS['tau']
+system.cell_system.skin = 0.1
+system.cell_system.set_n_square()
+n_nodes = np.prod(system.cell_system.node_grid)
+
+coord_indexes = {"x": 0, "y": 1, "z": 2}
+
+
+class LBCouetteFlowCommon:
+
+    def analytical(self, x, t, nu, v, h, k_max):
+        """
+        Analytical solution with Fourier series of the Navier-Stokes equation.
+
+        Parameters
+        ----------
+        x : :obj:`float`
+            Height within the channel
+        t : :obj:`float`
+            Time since the start up of the shear flow
+        nu: :obj:`float`
+            Kinematic kinematic_viscosity
+        v: :obj:`float`
+            Shearing velocity
+        h : :obj:`float`
+            Distance between shear planes
+        k_max : :obj:`int`
+            Upper limit of sums for sinus series
+
+        """
+        u = x / h - 0.5
+        for k in np.arange(1, k_max + 1):
+            wave = 2 * np.pi * k / h
+            u += np.exp(-nu * wave ** 2 * t) * np.sin(wave * x) / (np.pi * k)
+        return v * u
+
+    def setUp(self):
+        system.time = 0.
+
+    # def tearDown(self):
+        system.lb = None
+        system.lees_edwards.protocol = None
+
+    def check_profile(self, u_getter, **kwargs):
+        # carefully select the domain decomposition
+        assert kwargs["shear_plane_normal"] == "y"
+        h = system.box_l[coord_indexes[kwargs["shear_plane_normal"]]]
+        shear_velocity = 0.05
+        k_max = 100
+
+        protocol = espressomd.lees_edwards.LinearShear(
+            shear_velocity=shear_velocity, initial_pos_offset=0., time_0=0.)
+        system.lees_edwards.set_boundary_conditions(
+            protocol=protocol, **kwargs)
+        agrid = LB_PARAMS["agrid"]
+
+        lbf = self.lb_class(**LB_PARAMS, **self.lb_params)
+        system.lb = lbf
+
+        # warmup
+        system.integrator.run(8)
+
+        # sampling
+        for i in range(4, 9):
+            steps = (2**i - 2**(i - 1))
+            system.integrator.run(steps)
+            pos = np.array(range(int(h))) + agrid / 2.
+            u_ref = self.analytical(pos, system.time - 1., lbf.kinematic_viscosity,
+                                    shear_velocity, h, k_max)
+            u_lbf = np.copy(u_getter(lbf).reshape([-1]))
+            np.testing.assert_allclose(u_lbf, u_ref, atol=1e-4, rtol=0.)
+
+    @ut.skipIf(n_nodes == 1, "test is designed to run on multiple MPI ranks")
+    @ut.expectedFailure
+    def test_profile_xy_divided_shear_direction(self):
+        system.cell_system.node_grid = [n_nodes, 1, 1]
+        self.check_profile(lambda lbf: lbf[5, :, 0].velocity[:, 0],
+                           shear_direction="x", shear_plane_normal="y")
+
+    @ut.skip("TODO: LB+Lees Edwards doesn't work for domain decomposition along shear plane normal direction")  # TODO
+    @ut.skipIf(n_nodes == 1, "test is designed to run on multiple MPI ranks")
+    def test_profile_xy_divided_normal_direction(self):
+        system.cell_system.node_grid = [1, n_nodes, 1]
+        self.check_profile(lambda lbf: lbf[5, :, 0].velocity[:, 0],
+                           shear_direction="x", shear_plane_normal="y")
+
+    def test_profile_xy_divided_z_direction(self):
+        system.cell_system.node_grid = [1, 1, n_nodes]
+        self.check_profile(lambda lbf: lbf[5, :, 0].velocity[:, 0],
+                           shear_direction="x", shear_plane_normal="y")
+
+
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBCouetteFlowWalberla(LBCouetteFlowCommon, ut.TestCase):
+
+    """Test for the Walberla implementation of the LB in double-precision."""
+
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False}
+
+
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBCouetteFlowWalberlaSinglePrecision(LBCouetteFlowCommon, ut.TestCase):
+
+    """Test for the Walberla implementation of the LB in single-precision."""
+
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True}
+
+
+if __name__ == '__main__':
+    ut.main()
diff --git a/testsuite/python/lb_interpolation.py b/testsuite/python/lb_interpolation.py
index 3a2bd9a491..f67fcdf268 100644
--- a/testsuite/python/lb_interpolation.py
+++ b/testsuite/python/lb_interpolation.py
@@ -55,6 +55,7 @@ class LBInterpolation:
     system = espressomd.System(box_l=[BOX_L] * 3)
     system.cell_system.skin = 0.4 * AGRID
     system.time_step = TIME_STEP
+    system.periodicity = [False, True, True]
 
     def setUp(self):
         self.lbf = self.lb_class(**LB_PARAMETERS, **self.lb_params)
@@ -187,5 +188,17 @@ class LBInterpolationWalberlaSinglePrecisionGPU(LBInterpolation, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBInterpolationWalberlaDoublePrecisionBlocksCPU(LBInterpolation, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBInterpolationWalberlaSinglePrecisionBlocksCPU(LBInterpolation, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_mass_conservation.py b/testsuite/python/lb_mass_conservation.py
index fcbbab66b6..0f0ae30631 100644
--- a/testsuite/python/lb_mass_conservation.py
+++ b/testsuite/python/lb_mass_conservation.py
@@ -41,7 +41,7 @@ class LBMassCommon:
 
     """Check the lattice-Boltzmann mass conservation."""
 
-    system = espressomd.System(box_l=[3.0, 3.0, 3.0])
+    system = espressomd.System(box_l=[4.0, 4.0, 4.0])
     system.time_step = TIME_STEP
     system.cell_system.skin = 0.4 * AGRID
 
@@ -96,5 +96,14 @@ class LBMassWalberlaSinglePrecisionGPU(LBMassCommon, ut.TestCase):
     atol = 5e-7
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBMassWalberlaDoublePrecisionBlocksCPU(LBMassCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    blocks_per_mpi_rank = [1, 1, 2]
+    lb_params = {"single_precision": False,
+                 "blocks_per_mpi_rank": blocks_per_mpi_rank}
+    atol = 1e-10
+
+
 if __name__ == '__main__':
     ut.main()
diff --git a/testsuite/python/lb_momentum_conservation.py b/testsuite/python/lb_momentum_conservation.py
index 0d72f83ec5..f64c0543a5 100644
--- a/testsuite/python/lb_momentum_conservation.py
+++ b/testsuite/python/lb_momentum_conservation.py
@@ -218,5 +218,19 @@ def set_cellsystem(self):
         self.system.cell_system.set_n_square()
 
 
+@ut.skipIf(TestLBMomentumConservation.n_nodes != 1,
+           "LB with regular decomposition already tested with 2 MPI ranks")
+@utx.skipIfMissingFeatures(["WALBERLA", "EXTERNAL_FORCES"])
+class TestLBMomentumConservationRegularDoublePrecisionWalberlaBlocksCPU(
+        TestLBMomentumConservation, ut.TestCase):
+
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 2, 2]}
+    atol = 1.2e-4
+
+    def set_cellsystem(self):
+        self.system.cell_system.set_regular_decomposition()
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_planar_couette.py b/testsuite/python/lb_planar_couette.py
index 7295128b86..991284bcab 100644
--- a/testsuite/python/lb_planar_couette.py
+++ b/testsuite/python/lb_planar_couette.py
@@ -111,11 +111,24 @@ def check_profile(self, u_getter, **kwargs):
             np.testing.assert_allclose(u_lbf, u_ref, atol=1e-4, rtol=0.)
 
     def test_profile_xy(self):
-        self.check_profile(lambda lbf: lbf[5, :, 0].velocity[:, 0],
-                           shear_direction="x", shear_plane_normal="y")
+        if hasattr(self, 'blocks_per_mpi_rank'):
+            if self.blocks_per_mpi_rank[0] != 1:
+                with self.assertRaises(ValueError):
+                    self.check_profile(lambda lbf: lbf[5, :, 0].velocity[:, 0],
+                                       shear_direction="x", shear_plane_normal="y")
+            else:
+                self.skipTest(
+                    "Skipping test: only runs for blocks_per_mpi_rank=[X,1,1], where X is any integer")
+
+        else:
+            self.check_profile(lambda lbf: lbf[5, :, 0].velocity[:, 0],
+                               shear_direction="x", shear_plane_normal="y")
 
     @ut.skipIf(n_nodes > 1, "Skipping test: only runs for n_nodes == 1")
     def test_profile_zy(self):
+        if hasattr(self, 'blocks_per_mpi_rank'):
+            self.skipTest(
+                "Skipping test: only runs without blocks_per_mpi_rank")
         self.check_profile(lambda lbf: lbf[0, :, 5].velocity[:, 0],
                            shear_direction="z", shear_plane_normal="y")
 
@@ -142,5 +155,18 @@ class LBCouetteFlowWalberlaSinglePrecision(LBCouetteFlowCommon, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+@ut.skipIf(LBCouetteFlowCommon.n_nodes > 2,
+           "Skipping test: only runs for n_nodes <= 2")
+class LBCouetteFlowWalberlaBlocks(LBCouetteFlowCommon, ut.TestCase):
+
+    """Test for the Walberla implementation of the LB in double-precision."""
+
+    lb_class = espressomd.lb.LBFluidWalberla
+    blocks_per_mpi_rank = [2, 1, 1]
+    lb_params = {"single_precision": False,
+                 "blocks_per_mpi_rank": blocks_per_mpi_rank}
+
+
 if __name__ == '__main__':
     ut.main()
diff --git a/testsuite/python/lb_poiseuille.py b/testsuite/python/lb_poiseuille.py
index 9a4178d7af..4b259653e7 100644
--- a/testsuite/python/lb_poiseuille.py
+++ b/testsuite/python/lb_poiseuille.py
@@ -117,6 +117,7 @@ def test_profile(self):
                                      EXT_FORCE,
                                      KINEMATIC_VISC * DENS)
         np.testing.assert_allclose(v_measured, v_expected, rtol=5E-5)
+        # np.testing.assert_allclose(v_measured, v_expected, rtol=5E-5, atol=8E-4)
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
@@ -145,5 +146,17 @@ class LBPoiseuilleWalberlaSinglePrecisionGPU(LBPoiseuilleCommon, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBPoiseuilleWalberlaDoublePrecisionBlocksCPU(LBPoiseuilleCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBPoiseuilleWalberlaSinglePrecisionBlocksCPU(LBPoiseuilleCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
 if __name__ == '__main__':
     ut.main()
diff --git a/testsuite/python/lb_poiseuille_cylinder.py b/testsuite/python/lb_poiseuille_cylinder.py
index 4499f8661d..aa6493b48c 100644
--- a/testsuite/python/lb_poiseuille_cylinder.py
+++ b/testsuite/python/lb_poiseuille_cylinder.py
@@ -222,5 +222,11 @@ class LBPoiseuilleWalberlaSinglePrecisionGPU(LBPoiseuilleCommon, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBPoiseuilleWalberlaDoublePrecisionBlocksCPU(LBPoiseuilleCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
 if __name__ == '__main__':
     ut.main()
diff --git a/testsuite/python/lb_pressure_tensor.py b/testsuite/python/lb_pressure_tensor.py
index 0c0b4df3a6..263ff9295f 100644
--- a/testsuite/python/lb_pressure_tensor.py
+++ b/testsuite/python/lb_pressure_tensor.py
@@ -156,6 +156,14 @@ class TestLBPressureTensorCPU(TestLBPressureTensor, ut.TestCase):
     steps = 5000
 
 
+@utx.skipIfMissingFeatures("WALBERLA")
+class TestLBPressureTensorBlocksCPU(TestLBPressureTensor, ut.TestCase):
+
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True, "blocks_per_mpi_rank": [2, 2, 2]}
+    steps = 5000
+
+
 # TODO WALBERLA
 """
 @utx.skipIfMissingFeatures("WALBERLA")
diff --git a/testsuite/python/lb_shear.py b/testsuite/python/lb_shear.py
index fef0838ba6..3f637cb3af 100644
--- a/testsuite/python/lb_shear.py
+++ b/testsuite/python/lb_shear.py
@@ -85,7 +85,7 @@ class LBShearCommon:
     system.cell_system.skin = 0.4 * AGRID
 
     def setUp(self):
-        self.lbf = self.lb_class(**LB_PARAMS, **self.lb_params)
+        self.system.lb = None
 
     def tearDown(self):
         self.system.lb = None
@@ -96,9 +96,14 @@ def check_profile(self, shear_plane_normal, shear_direction):
         the exact solution.
         """
         self.tearDown()
-        self.system.box_l = np.max(
-            ((W, W, W), shear_plane_normal * (H + 2 * AGRID)), 0)
-        self.setUp()
+        if hasattr(self, 'blocks_per_mpi_rank'):
+            self.system.box_l = np.max(
+                ((W, W, W) * np.array(self.blocks_per_mpi_rank),
+                 shear_plane_normal * (H + 2 * AGRID) * np.array(self.blocks_per_mpi_rank)), 0)
+        else:
+            self.system.box_l = np.max(
+                ((W, W, W), shear_plane_normal * (H + 2 * AGRID)), 0)
+        self.lbf = self.lb_class(**LB_PARAMS, **self.lb_params)
         self.system.lb = self.lbf
         self.lbf.clear_boundaries()
 
@@ -204,5 +209,18 @@ class LBShearWalberlaSinglePrecision(LBShearCommon, ut.TestCase):
     rtol = 5e-3
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBShearWalberlaBlocks(LBShearCommon, ut.TestCase):
+
+    """Test for the Walberla implementation of the LB in double-precision."""
+
+    lb_class = espressomd.lb.LBFluidWalberla
+    blocks_per_mpi_rank = [2, 2, 2]
+    lb_params = {"single_precision": False,
+                 "blocks_per_mpi_rank": blocks_per_mpi_rank}
+    atol = 5e-5
+    rtol = 5e-4
+
+
 if __name__ == '__main__':
     ut.main()
diff --git a/testsuite/python/lb_slice.py b/testsuite/python/lb_slice.py
index 09a49dc4bd..c2a43def65 100644
--- a/testsuite/python/lb_slice.py
+++ b/testsuite/python/lb_slice.py
@@ -200,5 +200,19 @@ class LBTestWalberlaSinglePrecisionGPU(LBTest, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures("WALBERLA")
+class LBTestWalberlaDoublePrecisionBlocksCPU(LBTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_lattice_class = espressomd.lb.LatticeWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [1, 1, 2]}
+
+
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBTestWalberlaSinglePrecisionBlocksCPU(LBTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_lattice_class = espressomd.lb.LatticeWalberla
+    lb_params = {"single_precision": True, "blocks_per_mpi_rank": [1, 1, 2]}
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_streaming.py b/testsuite/python/lb_streaming.py
index ad9fefa350..8798d1474f 100644
--- a/testsuite/python/lb_streaming.py
+++ b/testsuite/python/lb_streaming.py
@@ -163,5 +163,13 @@ class LBStreamingWalberlaSinglePrecisionGPU(LBStreamingCommon, ut.TestCase):
     rtol = 1e-5
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBStreamingWalberlaDoublePrecisionBlocksCPU(LBStreamingCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [1, 2, 2]}
+    box_l = [3., 2., 2.]
+    rtol = 1e-10
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_thermostat.py b/testsuite/python/lb_thermostat.py
index 53de0ea8ca..a5027cf699 100644
--- a/testsuite/python/lb_thermostat.py
+++ b/testsuite/python/lb_thermostat.py
@@ -245,5 +245,11 @@ class LBThermostatWalberlaSinglePrecisionGPU(LBThermostatCommon, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBThermostatWalberlaDoublePrecisionBlocksCPU(LBThermostatCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
 if __name__ == '__main__':
     ut.main()
diff --git a/testsuite/python/save_checkpoint.py b/testsuite/python/save_checkpoint.py
index 504ec63546..31f9ce85f9 100644
--- a/testsuite/python/save_checkpoint.py
+++ b/testsuite/python/save_checkpoint.py
@@ -75,7 +75,7 @@
     protocol = espressomd.lees_edwards.LinearShear(
         initial_pos_offset=0.1, time_0=0.2, shear_velocity=1.2)
     system.lees_edwards.set_boundary_conditions(
-        shear_direction="x", shear_plane_normal="y", protocol=protocol)
+        shear_direction="z", shear_plane_normal="y", protocol=protocol)
 
 has_ase = "ASE" in modes
 
diff --git a/testsuite/python/test_checkpoint.py b/testsuite/python/test_checkpoint.py
index f2193a9c7c..05b45c5a37 100644
--- a/testsuite/python/test_checkpoint.py
+++ b/testsuite/python/test_checkpoint.py
@@ -378,7 +378,7 @@ def test_system_variables(self):
     def test_lees_edwards(self):
         lebc = system.lees_edwards
         protocol = lebc.protocol
-        self.assertEqual(lebc.shear_direction, "x")
+        self.assertEqual(lebc.shear_direction, "z")
         self.assertEqual(lebc.shear_plane_normal, "y")
         self.assertIsInstance(protocol, espressomd.lees_edwards.LinearShear)
         self.assertAlmostEqual(protocol.initial_pos_offset, 0.1, delta=1e-10)