Allocating multiple blocks to one MPI rank in LB #5026

Draft · wants to merge 23 commits into base: python
Changes from 12 commits
35 changes: 29 additions & 6 deletions maintainer/benchmarks/lb.py
@@ -50,6 +50,18 @@
parser.add_argument("--output", metavar="FILEPATH", action="store",
type=str, required=False, default="benchmarks.csv",
help="Output file (default: benchmarks.csv)")
parser.add_argument("--divided_block", action="store",
type=int, default=1, required=False,
help="blocks^(1/3) per MPI rank")
parser.add_argument("--divided_block_x", action="store",
type=int, default=0, required=False,
help="The number of divided blocks for x direction")
parser.add_argument("--divided_block_y", action="store",
type=int, default=0, required=False,
help="The number of divided blocks for y direction")
parser.add_argument("--divided_block_z", action="store",
type=int, default=0, required=False,
help="The number of divided blocks for z direction")

args = parser.parse_args()

@@ -85,7 +97,10 @@
n_proc = system.cell_system.get_state()["n_nodes"]
n_part = n_proc * args.particles_per_core
if n_part == 0:
box_l = 3 * args.box_l if len(args.box_l) == 1 else args.box_l
if len(args.box_l) == 1:
box_l = 3 * args.box_l
elif len(args.box_l) == 3:
box_l = args.box_l
else:
raise ValueError("--box_l must be given either 1 or 3 values")
agrid = 1.
lb_grid = box_l
measurement_steps = 80
@@ -101,13 +116,21 @@
lb_grid = 3 * [lb_grid]
box_l = 3 * [box_l]

print(f"box length: {box_l}")
print(f"LB shape: {lb_grid}")
print(f"LB agrid: {agrid:.3f}")
divided_block_x = args.divided_block_x
divided_block_y = args.divided_block_y
divided_block_z = args.divided_block_z
if divided_block_x != 0 and divided_block_y != 0 and divided_block_z != 0:
blocks_per_mpi_rank = [divided_block_x,
divided_block_y, divided_block_z]
else:
divided_block = args.divided_block
blocks_per_mpi_rank = [divided_block] * 3

# System
#############################################################
system.box_l = box_l
system.box_l = box_l * system.cell_system.node_grid
print(f"LB agrid: {agrid:.3f}")
print("box length:", system.box_l)

# Integration parameters
#############################################################
@@ -145,7 +168,7 @@
if args.multi_gpu:
system.cuda_init_handle.call_method("set_device_id_per_rank")
lbf = lb_class(agrid=agrid, tau=system.time_step, kinematic_viscosity=1.,
density=1., single_precision=args.single_precision)
density=1., single_precision=args.single_precision, blocks_per_mpi_rank=blocks_per_mpi_rank)
system.lb = lbf
if n_part:
system.thermostat.set_lb(LB_fluid=lbf, gamma=1., seed=42)
6 changes: 6 additions & 0 deletions src/core/integrate.cpp
@@ -633,12 +633,18 @@ int System::System::integrate(int n_steps, int reuse_forces) {
ek.propagate();
}
} else if (lb_active) {
#ifdef CALIPER
CALI_MARK_BEGIN("LB.PROPAGATE");
#endif
auto const md_steps_per_lb_step = calc_md_steps_per_tau(lb.get_tau());
propagation.lb_skipped_md_steps += 1;
if (propagation.lb_skipped_md_steps >= md_steps_per_lb_step) {
propagation.lb_skipped_md_steps = 0;
lb.propagate();
}
#ifdef CALIPER
CALI_MARK_END("LB.PROPAGATE");
#endif
} else if (ek_active) {
auto const md_steps_per_ek_step = calc_md_steps_per_tau(ek.get_tau());
propagation.ek_skipped_md_steps += 1;
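A side note on the pattern in this hunk: CALI_MARK_BEGIN / CALI_MARK_END are Caliper macros that delimit a named timing region, and they are only compiled in when the CALIPER build option is enabled. A minimal self-contained sketch of the same idiom (the region name is illustrative; with Caliper installed, running the binary with e.g. CALI_CONFIG=runtime-report prints per-region timings):

#include <caliper/cali.h>

void timed_propagation() {
  CALI_MARK_BEGIN("LB.PROPAGATE");
  // ... LB propagation work to be timed ...
  CALI_MARK_END("LB.PROPAGATE");
}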
14 changes: 13 additions & 1 deletion src/core/lb/LBWalberla.cpp
@@ -40,6 +40,10 @@
#include <optional>
#include <variant>

#ifdef CALIPER
#include <caliper/cali.h>
#endif

namespace LB {

bool LBWalberla::is_gpu() const { return lb_fluid->is_gpu(); }
@@ -50,7 +54,15 @@ Utils::VectorXd<9> LBWalberla::get_pressure_tensor() const {
return lb_fluid->get_pressure_tensor();
}

void LBWalberla::propagate() { lb_fluid->integrate(); }
void LBWalberla::propagate() {
#ifdef CALIPER
CALI_MARK_BEGIN("LBWalberla.PROPAGATE");
#endif
lb_fluid->integrate();
#ifdef CALIPER
CALI_MARK_END("LBWalberla.PROPAGATE");
#endif
}

void LBWalberla::ghost_communication() { lb_fluid->ghost_communication(); }

10 changes: 10 additions & 0 deletions src/core/lb/Solver.cpp
@@ -47,6 +47,10 @@
#include <variant>
#include <vector>

#ifdef CALIPER
#include <caliper/cali.h>
#endif

namespace LB {

Solver::Solver() { impl = std::make_unique<Implementation>(); }
@@ -69,8 +73,14 @@ void Solver::reset() {
}

void Solver::propagate() {
#ifdef CALIPER
CALI_MARK_BEGIN("SOLVER.PROPAGATE");
#endif
check_solver(impl);
std::visit([](auto &ptr) { ptr->propagate(); }, *impl->solver);
#ifdef CALIPER
CALI_MARK_END("SOLVER.PROPAGATE");
#endif
}

void Solver::ghost_communication() {
3 changes: 2 additions & 1 deletion src/core/unit_tests/ek_interface_test.cpp
@@ -83,7 +83,8 @@ static auto make_ek_actor() {
auto constexpr n_ghost_layers = 1u;
auto constexpr single_precision = true;
ek_lattice = std::make_shared<LatticeWalberla>(
params.grid_dimensions, ::communicator.node_grid, n_ghost_layers);
params.grid_dimensions, ::communicator.node_grid,
::communicator.node_grid, n_ghost_layers);
ek_container = std::make_shared<EK::EKWalberla::ek_container_type>(
params.tau, walberla::new_ek_poisson_none(ek_lattice, single_precision));
ek_reactions = std::make_shared<EK::EKWalberla::ek_reactions_type>();
6 changes: 4 additions & 2 deletions src/core/unit_tests/lb_particle_coupling_test.cpp
@@ -102,7 +102,8 @@ static auto make_lb_actor() {
auto constexpr single_precision = false;
lb_params = std::make_shared<LB::LBWalberlaParams>(params.agrid, params.tau);
lb_lattice = std::make_shared<LatticeWalberla>(
params.grid_dimensions, ::communicator.node_grid, n_ghost_layers);
params.grid_dimensions, ::communicator.node_grid,
::communicator.node_grid, n_ghost_layers);
lb_fluid = new_lb_walberla_cpu(lb_lattice, params.viscosity, params.density,
single_precision);
lb_fluid->set_collision_model(params.kT, params.seed);
@@ -535,7 +536,8 @@ bool test_lb_domain_mismatch_local() {
auto const params = std::make_shared<LB::LBWalberlaParams>(0.5, 0.01);
::communicator.node_grid = node_grid_reversed;
auto const lattice = std::make_shared<LatticeWalberla>(
Utils::Vector3i{12, 12, 12}, node_grid_original, n_ghost_layers);
Utils::Vector3i{12, 12, 12}, node_grid_original, node_grid_original,
n_ghost_layers);
auto const ptr = new_lb_walberla_cpu(lattice, 1.0, 1.0, false);
ptr->set_collision_model(0.0, 0);
::communicator.node_grid = node_grid_original;
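These test fixtures now pass the block grid explicitly. As a sketch of what a multi-block fixture could look like under the new signature (hypothetical test code, not part of the PR; Utils::hadamard_product is assumed for the element-wise scaling):

auto const blocks_per_mpi_rank = Utils::Vector3i{{2, 1, 1}};
auto const block_grid =
    Utils::hadamard_product(::communicator.node_grid, blocks_per_mpi_rank);
lb_lattice = std::make_shared<LatticeWalberla>(
    params.grid_dimensions, ::communicator.node_grid, block_grid,
    n_ghost_layers);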
2 changes: 1 addition & 1 deletion src/python/espressomd/detail/walberla.py
@@ -47,7 +47,7 @@ def __init__(self, *args, **kwargs):
super().__init__(**kwargs)

def valid_keys(self):
return {"agrid", "n_ghost_layers"}
return {"agrid", "n_ghost_layers", "blocks_per_mpi_rank"}

def required_keys(self):
return self.valid_keys()
8 changes: 5 additions & 3 deletions src/python/espressomd/lb.py
@@ -58,14 +58,14 @@ def validate_params(self, params):

def valid_keys(self):
return {"agrid", "tau", "density", "ext_force_density",
"kinematic_viscosity", "lattice", "kT", "seed"}
"kinematic_viscosity", "lattice", "kT", "seed", "blocks_per_mpi_rank"}

def required_keys(self):
return {"lattice", "density", "kinematic_viscosity", "tau"}

def default_params(self):
return {"lattice": None, "seed": 0, "kT": 0.,
"ext_force_density": [0.0, 0.0, 0.0]}
"ext_force_density": [0.0, 0.0, 0.0], "blocks_per_mpi_rank": [1, 1, 1]}

def mach_limit(self):
"""
@@ -141,6 +141,8 @@ class LBFluidWalberla(HydrodynamicInteraction,
Required for a thermalized fluid. Must be positive.
single_precision : :obj:`bool`, optional
Use single-precision floating-point arithmetic.
blocks_per_mpi_rank : (3,) array_like of :obj:`int`, optional
Distribute more than one block to each MPI rank.

Methods
-------
@@ -240,7 +242,7 @@ def validate_params(self, params):
if "agrid" not in params:
raise ValueError("missing argument 'lattice' or 'agrid'")
params["lattice"] = LatticeWalberla(
agrid=params.pop("agrid"), n_ghost_layers=1)
agrid=params.pop("agrid"), n_ghost_layers=1, blocks_per_mpi_rank=params.get("blocks_per_mpi_rank"))
elif "agrid" in params:
raise ValueError("cannot provide both 'lattice' and 'agrid'")

6 changes: 6 additions & 0 deletions src/script_interface/walberla/LBFluid.cpp
@@ -139,6 +139,12 @@ void LBFluidGPU::make_instance(VariantMap const &params) {
auto const visc = get_value<double>(params, "kinematic_viscosity");
auto const dens = get_value<double>(params, "density");
auto const precision = get_value<bool>(params, "single_precision");
auto const blocks_per_mpi_rank = get_value_or<Utils::Vector3i>(
params, "blocks_per_mpi_rank", Utils::Vector3i{{1, 1, 1}});
Member:

Here and elsewhere, do we really need a default value, considering we already provide a default value in the python class?

Author:

I added a default value for the Python class LatticeWalberla, and blocks_per_mpi_rank in LBFluid is set by get_value<Utils::Vector3i>(m_lattice->get_parameter("blocks_per_mpi_rank")).
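A sketch of the retrieval the author describes on the C++ side (the default then lives only in the Python class, and the script interface reads the validated value back from the lattice object):

auto const blocks_per_mpi_rank = get_value<Utils::Vector3i>(
    m_lattice->get_parameter("blocks_per_mpi_rank"));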

if (blocks_per_mpi_rank != Utils::Vector3i{{1, 1, 1}}) {
throw std::runtime_error(
"GPU architecture PROHIBITED allocating many blocks to 1 CPU.");
}
Contributor:

"Using more than one block per MPI rank is not supported for GPU LB" (but why, actually?)

Member:

how about "GPU LB only uses 1 block per MPI rank"?
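Put together, the guard with the suggested wording would read (a sketch of the review suggestion, not the committed code):

if (blocks_per_mpi_rank != Utils::Vector3i{{1, 1, 1}}) {
  throw std::runtime_error("GPU LB only uses 1 block per MPI rank");
}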

auto const lb_lattice = m_lattice->lattice();
auto const lb_visc = m_conv_visc * visc;
auto const lb_dens = m_conv_dens * dens;
14 changes: 13 additions & 1 deletion src/script_interface/walberla/LatticeWalberla.hpp
@@ -43,6 +43,7 @@ class LatticeWalberla : public AutoParameters<LatticeWalberla> {
std::shared_ptr<::LatticeWalberla> m_lattice;
double m_agrid;
Utils::Vector3d m_box_l;
Utils::Vector3i m_blocks_per_mpi_rank;

public:
LatticeWalberla() {
@@ -53,14 +54,25 @@
{"shape", AutoParameter::read_only,
[this]() { return m_lattice->get_grid_dimensions(); }},
{"_box_l", AutoParameter::read_only, [this]() { return m_box_l; }},
{"blocks_per_mpi_rank", AutoParameter::read_only,
[this]() { return m_blocks_per_mpi_rank; }},
});
}

void do_construct(VariantMap const &args) override {
auto const &box_geo = *::System::get_system().box_geo;
m_agrid = get_value<double>(args, "agrid");
m_box_l = get_value_or<Utils::Vector3d>(args, "_box_l", box_geo.length());
m_blocks_per_mpi_rank = get_value_or<Utils::Vector3i>(
args, "blocks_per_mpi_rank", Utils::Vector3i{{1, 1, 1}});
auto const n_ghost_layers = get_value<int>(args, "n_ghost_layers");
auto const block_grid =
Utils::Vector3i{{static_cast<int>(::communicator.node_grid[0] *
m_blocks_per_mpi_rank[0]),
static_cast<int>(::communicator.node_grid[1] *
m_blocks_per_mpi_rank[1]),
static_cast<int>(::communicator.node_grid[2] *
m_blocks_per_mpi_rank[2])}};
Member:

Utils::hadamard_product()?

Author:

I have rewritten it as you suggested.
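For reference, a sketch of the rewrite with Utils::hadamard_product (the element-wise product of two vectors, equivalent to the per-component construction shown above):

auto const block_grid =
    Utils::hadamard_product(::communicator.node_grid, m_blocks_per_mpi_rank);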


context()->parallel_try_catch([&]() {
if (m_agrid <= 0.) {
@@ -72,7 +84,7 @@
auto const grid_dim =
::LatticeWalberla::calc_grid_dimensions(m_box_l, m_agrid);
m_lattice = std::make_shared<::LatticeWalberla>(
grid_dim, ::communicator.node_grid,
grid_dim, ::communicator.node_grid, block_grid,
static_cast<unsigned int>(n_ghost_layers));
});
}
@@ -52,6 +52,7 @@ class LatticeWalberla {
public:
LatticeWalberla(Utils::Vector3i const &grid_dimensions,
Utils::Vector3i const &node_grid,
Utils::Vector3i const &block_grid,
unsigned int n_ghost_layers);

// Grid, domain, halo
4 changes: 2 additions & 2 deletions src/walberla_bridge/src/BoundaryPackInfo.hpp
@@ -96,7 +96,7 @@ class BoundaryPackInfo : public PackInfo<GhostLayerField_T> {
WALBERLA_ASSERT_EQUAL(bSize, buf_size);
#endif

auto const offset = std::get<0>(m_lattice->get_local_grid_range());
auto const offset = to_vector3i(receiver->getAABB().min());
Contributor:

Wouldn't it be better to have functions for this in the Lattice class, so they can be used by EK as well? After all, LB and EK probably need to agree about both the MPI and the block decomposition.
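A sketch of the helper the reviewer has in mind (hypothetical name, placed on the Lattice class so the LB and EK pack infos share one definition):

// lower corner of a block in global lattice coordinates
Utils::Vector3i get_block_offset(IBlock const &block) const {
  return to_vector3i(block.getAABB().min());
}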

typename Boundary_T::value_type value;
for (auto it = begin(flag_field); it != flag_field->end(); ++it) {
if (isFlagSet(it, boundary_flag)) {
@@ -133,7 +133,7 @@
<< buf_size;
#endif

auto const offset = std::get<0>(m_lattice->get_local_grid_range());
auto const offset = to_vector3i(sender->getAABB().min());
for (auto it = begin(flag_field); it != flag_field->end(); ++it) {
if (isFlagSet(it, boundary_flag)) {
auto const node = offset + Utils::Vector3i{{it.x(), it.y(), it.z()}};
57 changes: 49 additions & 8 deletions src/walberla_bridge/src/LatticeWalberla.cpp
@@ -40,6 +40,7 @@

LatticeWalberla::LatticeWalberla(Utils::Vector3i const &grid_dimensions,
Utils::Vector3i const &node_grid,
Utils::Vector3i const &block_grid,
unsigned int n_ghost_layers)
: m_grid_dimensions{grid_dimensions}, m_n_ghost_layers{n_ghost_layers} {
using walberla::real_t;
@@ -50,21 +51,28 @@ LatticeWalberla::LatticeWalberla(Utils::Vector3i const &grid_dimensions,
throw std::runtime_error(
"Lattice grid dimensions and MPI node grid are not compatible.");
}
if (m_grid_dimensions[i] % block_grid[i] != 0) {
throw std::runtime_error(
"Lattice grid dimensions and block grid are not compatible.");
}
}

auto constexpr lattice_constant = real_t{1};
auto const cells_block = Utils::hadamard_division(grid_dimensions, node_grid);
Contributor:

cells_per_block?

Author:

I changed cells_block to cells_per_block.

auto const cells_block =
Utils::hadamard_division(grid_dimensions, block_grid);

m_blocks = walberla::blockforest::createUniformBlockGrid(
// number of blocks in each direction
uint_c(node_grid[0]), uint_c(node_grid[1]), uint_c(node_grid[2]),
uint_c(block_grid[0]), uint_c(block_grid[1]), uint_c(block_grid[2]),
// number of cells per block in each direction
uint_c(cells_block[0]), uint_c(cells_block[1]), uint_c(cells_block[2]),
lattice_constant,
// number of cpus per direction
uint_c(node_grid[0]), uint_c(node_grid[1]), uint_c(node_grid[2]),
// periodicity
true, true, true);
true, true, true,
// keep global block information
Contributor:

What does this do/mean?

Author:

If "keep global block information" is true, each process keeps information about remote blocks that reside on other processes.

false);
for (IBlock &block : *m_blocks) {
m_cached_blocks.push_back(&block);
}
@@ -73,11 +81,44 @@
[[nodiscard]] std::pair<Utils::Vector3d, Utils::Vector3d>
Contributor:

Are we sure that there is no function in block forest or uniform block forest which supplies the answer to this? If not, and we really have to calculate this ourselves, it could probably be a bit more compact. ChatGPT came up with:

    // Initialize bounds to extreme values
    std::array<double, 3> globalMin = {std::numeric_limits<double>::max(),
                                       std::numeric_limits<double>::max(),
                                       std::numeric_limits<double>::max()};
    std::array<double, 3> globalMax = {std::numeric_limits<double>::lowest(),
                                       std::numeric_limits<double>::lowest(),
                                       std::numeric_limits<double>::lowest()};

    for (const auto& block : blocks) {
        const auto& blockMin = block.min();
        const auto& blockMax = block.max();
        for (size_t i = 0; i < 3; ++i) {
            globalMin[i] = std::min(globalMin[i], blockMin[i]);
            globalMax[i] = std::max(globalMax[i], blockMax[i]);
        }
    }

Author:

In my understanding, a BlockForest knows the domain size of the whole space, and each block knows its own domain range within it. Hence, there is no class that knows the domain range of a single MPI rank. I rewrote the code to make it more compact.
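An even shorter element-wise variant (a sketch, valid under the assumption that the blocks owned by one rank tile a rectangular box, which the uniform decomposition guarantees; to_vector3d is the existing walberla helper used below):

#include <algorithm>

auto it = m_blocks->begin();
auto lower = it->getAABB().min();  // walberla::Vector3<real_t>
auto upper = it->getAABB().max();
for (; it != m_blocks->end(); ++it) {
  auto const &aabb = it->getAABB();
  for (auto const i : {0u, 1u, 2u}) {
    lower[i] = std::min(lower[i], aabb.min()[i]);
    upper[i] = std::max(upper[i], aabb.max()[i]);
  }
}
return {to_vector3d(lower), to_vector3d(upper)};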

LatticeWalberla::get_local_domain() const {
using walberla::to_vector3d;
// We only have one block per mpi rank
assert(++(m_blocks->begin()) == m_blocks->end());

auto const ab = m_blocks->begin()->getAABB();
return {to_vector3d(ab.min()), to_vector3d(ab.max())};
// Get the upper and lower corners of the block forest assigned to this
// MPI rank. Since multiple blocks can be allocated per MPI rank,
// the corners of all blocks are compared.
int64_t const stride_y = m_grid_dimensions[2];
int64_t const stride_x = m_grid_dimensions[1] * stride_y;
auto aa = m_blocks->begin()->getAABB();
auto bb = m_blocks->begin()->getAABB();
int64_t aa_index = stride_x * static_cast<int>(aa.min()[0]) +
stride_y * static_cast<int>(aa.min()[1]) +
static_cast<int>(aa.min()[2]);
int64_t bb_index = stride_x * static_cast<int>(bb.max()[0]) +
stride_y * static_cast<int>(bb.max()[1]) +
static_cast<int>(bb.max()[2]);
for (auto b = m_blocks->begin(); b != m_blocks->end(); ++b) {
auto cc = b->getAABB();
for (auto const i : {0u, 1u, 2u}) {
if ((cc.max()[i] - cc.min()[i]) != 0) {
assert(m_grid_dimensions[i] %
static_cast<int>(cc.max()[i] - cc.min()[i]) ==
0);
}
}
int64_t min_index = stride_x * static_cast<int>(cc.min()[0]) +
stride_y * static_cast<int>(cc.min()[1]) +
static_cast<int>(cc.min()[2]);
int64_t max_index = stride_x * static_cast<int>(cc.max()[0]) +
stride_y * static_cast<int>(cc.max()[1]) +
static_cast<int>(cc.max()[2]);
if (min_index < aa_index) {
aa = cc;
aa_index = min_index;
}
if (max_index > bb_index) {
bb = cc;
bb_index = max_index;
}
}
return {to_vector3d(aa.min()), to_vector3d(bb.max())};
}

[[nodiscard]] bool