Allocating multiple blocks to one MPI rank in LB #5026

Draft · wants to merge 23 commits into base: python · Changes from 16 commits
7 changes: 6 additions & 1 deletion maintainer/benchmarks/lb.py
@@ -50,6 +50,9 @@
parser.add_argument("--output", metavar="FILEPATH", action="store",
type=str, required=False, default="benchmarks.csv",
help="Output file (default: benchmarks.csv)")
parser.add_argument("--blocks_per_mpi_rank", action="store", nargs=3,
type=int, default=[1, 1, 1], required=False,
help="blocks per mpi rank")

args = parser.parse_args()

@@ -105,6 +108,8 @@
print(f"LB shape: {lb_grid}")
print(f"LB agrid: {agrid:.3f}")

blocks_per_mpi_rank = args.blocks_per_mpi_rank

# System
#############################################################
system.box_l = box_l
@@ -145,7 +150,7 @@
if args.multi_gpu:
system.cuda_init_handle.call_method("set_device_id_per_rank")
lbf = lb_class(agrid=agrid, tau=system.time_step, kinematic_viscosity=1.,
density=1., single_precision=args.single_precision)
density=1., single_precision=args.single_precision, blocks_per_mpi_rank=blocks_per_mpi_rank)
system.lb = lbf
if n_part:
system.thermostat.set_lb(LB_fluid=lbf, gamma=1., seed=42)
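For context, here is a minimal sketch of the new parameter as seen from the Python API (block counts are arbitrary illustration values):

```python
import espressomd
import espressomd.lb

system = espressomd.System(box_l=[32., 32., 32.])
system.time_step = 0.01
system.cell_system.skin = 0.5

# split the domain of every MPI rank into 2x1x1 waLBerla blocks
lbf = espressomd.lb.LBFluidWalberla(
    agrid=1., tau=system.time_step, kinematic_viscosity=1.,
    density=1., blocks_per_mpi_rank=[2, 1, 1])
system.lb = lbf
```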
166 changes: 166 additions & 0 deletions maintainer/benchmarks/lb_weakscaling.py
@@ -0,0 +1,166 @@
#
# Copyright (C) 2013-2022 The ESPResSo project
#
# This file is part of ESPResSo.
#
# ESPResSo is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ESPResSo is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

"""
Benchmark Lattice-Boltzmann fluid + Lennard-Jones particles.
"""
Member: Wouldn't it be more sustainable if we only had one LB benchmark file to maintain? argparse is quite flexible, surely we can come up with a way to select strong vs. weak scaling with command line options?

Author: I unified lb.py and lb_weakscaling.py by adding a --weak_scaling option to the argparse setup in lb.py.
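A rough sketch of what that unification could look like; the flag name comes from the comment above, the wiring is an assumption:

```python
import argparse

import numpy as np

parser = argparse.ArgumentParser(description="Benchmark LB simulations.")
parser.add_argument("--weak_scaling", action="store_true", required=False,
                    help="Grow the box with the MPI node grid so the work "
                         "per rank stays constant (default: strong scaling "
                         "with a fixed global box)")
args = parser.parse_args()

box_l = np.array([16., 16., 16.])
node_grid = np.array([2, 2, 1])  # stand-in for system.cell_system.node_grid
if args.weak_scaling:
    box_l = box_l * node_grid  # constant work per rank as ranks are added
print(f"global box: {box_l}")
```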

import espressomd
import espressomd.lb
import benchmarks
import numpy as np
import argparse

parser = argparse.ArgumentParser(description="Benchmark LB simulations. "
"Save the results to a CSV file.")
parser.add_argument("--particles_per_core", metavar="N", action="store",
type=int, default=125, required=False,
help="Number of particles per core")
parser.add_argument("--box_l", action="store", nargs="+",
type=int, default=argparse.SUPPRESS, required=False,
help="Box length (cubic box)")
parser.add_argument("--lb_sites_per_particle", metavar="N_LB", action="store",
type=float, default=28, required=False,
help="Number of LB sites per particle")
parser.add_argument("--volume_fraction", metavar="FRAC", action="store",
type=float, default=0.03, required=False,
help="Fraction of the simulation box volume occupied by "
"particles (range: [0.01-0.74], default: 0.03)")
parser.add_argument("--single_precision", action="store_true", required=False,
help="Using single-precision floating point accuracy")
parser.add_argument("--gpu", action=argparse.BooleanOptionalAction,
default=False, required=False, help="Use GPU implementation")
parser.add_argument("--multi-gpu", action=argparse.BooleanOptionalAction,
default=False, required=False, help="Use multi-GPU implementation")
parser.add_argument("--output", metavar="FILEPATH", action="store",
type=str, required=False, default="benchmarks.csv",
help="Output file (default: benchmarks.csv)")
parser.add_argument("--blocks_per_mpi_rank", action="store", nargs=3,
type=int, default=[1, 1, 1], required=False,
help="blocks per mpi rank")

args = parser.parse_args()

# process and check arguments
n_iterations = 30
assert args.volume_fraction > 0, "--volume_fraction must be a positive number"
assert args.volume_fraction < np.pi / (3 * np.sqrt(2)), \
"--volume_fraction exceeds the physical limit of sphere packing (~0.74)"
assert "box_l" not in args or args.particles_per_core == 0, \
"Argument --box_l requires --particles_per_core=0"

required_features = ["LENNARD_JONES", "WALBERLA"]
if args.gpu:
required_features.append("CUDA")
espressomd.assert_features(required_features)

# make simulation deterministic
np.random.seed(42)

# System
#############################################################
system = espressomd.System(box_l=[1, 1, 1])

# Interaction parameters (Lennard-Jones)
#############################################################

lj_eps = 1.0 # LJ epsilon
lj_sig = 1.0 # particle diameter
lj_cut = lj_sig * 2**(1. / 6.) # cutoff distance

# System parameters
#############################################################
n_proc = system.cell_system.get_state()["n_nodes"]
n_part = n_proc * args.particles_per_core
if n_part == 0:
box_l = 3 * args.box_l if len(args.box_l) == 1 else args.box_l
agrid = 1.
lb_grid = box_l
measurement_steps = 80
else:
# volume of N spheres with radius r: N * (4/3*pi*r^3)
box_l = (n_part * 4. / 3. * np.pi * (lj_sig / 2.)**3
/ args.volume_fraction)**(1. / 3.)
lb_grid = (n_part * args.lb_sites_per_particle)**(1. / 3.)
lb_grid = int(2. * round(lb_grid / 2.))
agrid = box_l / lb_grid
measurement_steps = max(50, int(120**3 / lb_grid**3))
measurement_steps = 40
lb_grid = 3 * [lb_grid]
box_l = 3 * [box_l]

blocks_per_mpi_rank = args.blocks_per_mpi_rank

# System
#############################################################
system.box_l = box_l * system.cell_system.node_grid
print(f"box length: {system.box_l}")
print(f"LB shape: {lb_grid}")
print(f"LB agrid: {agrid:.3f}")

# Integration parameters
#############################################################
system.time_step = 0.01
system.cell_system.skin = 0.5

# Interaction and particle setup
#############################################################
if n_part:
system.non_bonded_inter[0, 0].lennard_jones.set_params(
epsilon=lj_eps, sigma=lj_sig, cutoff=lj_cut, shift="auto")
system.part.add(pos=np.random.random((n_part, 3)) * system.box_l)
benchmarks.minimize(system, n_part / 2.)
system.integrator.set_vv()
system.thermostat.set_langevin(kT=1.0, gamma=1.0, seed=42)

# tuning and equilibration
min_skin = 0.2
max_skin = 1.0
print("Tune skin: {:.3f}".format(system.cell_system.tune_skin(
min_skin=min_skin, max_skin=max_skin, tol=0.05, int_steps=100)))
print("Equilibration")
system.integrator.run(500)
print("Tune skin: {:.3f}".format(system.cell_system.tune_skin(
min_skin=min_skin, max_skin=max_skin, tol=0.05, int_steps=100)))
print("Equilibration")
system.integrator.run(500)
system.thermostat.turn_off()

# LB fluid setup
#############################################################
lb_class = espressomd.lb.LBFluidWalberla
if args.gpu or args.multi_gpu:
lb_class = espressomd.lb.LBFluidWalberlaGPU
if args.multi_gpu:
system.cuda_init_handle.call_method("set_device_id_per_rank")
lbf = lb_class(agrid=agrid, tau=system.time_step, kinematic_viscosity=1.,
density=1., single_precision=args.single_precision, blocks_per_mpi_rank=blocks_per_mpi_rank)
system.lb = lbf
if n_part:
system.thermostat.set_lb(LB_fluid=lbf, gamma=1., seed=42)


# time integration loop
timings = benchmarks.get_timings(system, measurement_steps, n_iterations)

# average time
avg, ci = benchmarks.get_average_time(timings)
print(f"average: {1000 * avg:.2f} +/- {1000 * ci:.2f} ms (95% C.I.)")

# write report
benchmarks.write_report(args.output, n_proc, timings, measurement_steps)
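As a quick sanity check of the box-sizing arithmetic in the script above, here is a standalone sketch with example numbers (4 MPI ranks, default arguments assumed):

```python
import numpy as np

n_proc = 4                     # MPI ranks
particles_per_core = 125       # default --particles_per_core
volume_fraction = 0.03         # default --volume_fraction
lj_sig = 1.0                   # particle diameter
lb_sites_per_particle = 28.    # default --lb_sites_per_particle

n_part = n_proc * particles_per_core
# volume of N spheres with radius lj_sig/2, scaled to the target packing fraction
box_l = (n_part * 4. / 3. * np.pi * (lj_sig / 2.)**3
         / volume_fraction)**(1. / 3.)
# round the LB grid to an even number of sites per direction
lb_grid = int(2. * round((n_part * lb_sites_per_particle)**(1. / 3.) / 2.))
print(f"box_l = {box_l:.2f}, lb_grid = {lb_grid}, agrid = {box_l / lb_grid:.3f}")
```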
6 changes: 6 additions & 0 deletions src/core/integrate.cpp
@@ -633,12 +633,18 @@ int System::System::integrate(int n_steps, int reuse_forces) {
ek.propagate();
}
} else if (lb_active) {
#ifdef CALIPER
CALI_MARK_BEGIN("LB.PROPAGATE");
#endif
auto const md_steps_per_lb_step = calc_md_steps_per_tau(lb.get_tau());
propagation.lb_skipped_md_steps += 1;
if (propagation.lb_skipped_md_steps >= md_steps_per_lb_step) {
propagation.lb_skipped_md_steps = 0;
lb.propagate();
}
#ifdef CALIPER
CALI_MARK_END("LB.PROPAGATE");
#endif
} else if (ek_active) {
auto const md_steps_per_ek_step = calc_md_steps_per_tau(ek.get_tau());
propagation.ek_skipped_md_steps += 1;
14 changes: 13 additions & 1 deletion src/core/lb/LBWalberla.cpp
@@ -40,6 +40,10 @@
#include <optional>
#include <variant>

#ifdef CALIPER
#include <caliper/cali.h>
#endif

namespace LB {

bool LBWalberla::is_gpu() const { return lb_fluid->is_gpu(); }
@@ -50,7 +54,15 @@ Utils::VectorXd<9> LBWalberla::get_pressure_tensor() const {
return lb_fluid->get_pressure_tensor();
}

void LBWalberla::propagate() { lb_fluid->integrate(); }
void LBWalberla::propagate() {
#ifdef CALIPER
CALI_MARK_BEGIN("LBWalberla.PROPAGATE");
#endif
lb_fluid->integrate();
#ifdef CALIPER
CALI_MARK_END("LBWalberla.PROPAGATE");
#endif
}

void LBWalberla::ghost_communication() { lb_fluid->ghost_communication(); }

10 changes: 10 additions & 0 deletions src/core/lb/Solver.cpp
@@ -47,6 +47,10 @@
#include <variant>
#include <vector>

#ifdef CALIPER
#include <caliper/cali.h>
#endif

namespace LB {

Solver::Solver() { impl = std::make_unique<Implementation>(); }
@@ -69,8 +73,14 @@ void Solver::reset() {
}

void Solver::propagate() {
#ifdef CALIPER
CALI_MARK_BEGIN("SOLVER.PROPAGATE");
#endif
check_solver(impl);
std::visit([](auto &ptr) { ptr->propagate(); }, *impl->solver);
#ifdef CALIPER
CALI_MARK_END("SOLVER.PROPAGATE");
#endif
}

void Solver::ghost_communication() {
3 changes: 2 additions & 1 deletion src/core/unit_tests/ek_interface_test.cpp
@@ -83,7 +83,8 @@ static auto make_ek_actor() {
auto constexpr n_ghost_layers = 1u;
auto constexpr single_precision = true;
ek_lattice = std::make_shared<LatticeWalberla>(
params.grid_dimensions, ::communicator.node_grid, n_ghost_layers);
params.grid_dimensions, ::communicator.node_grid,
::communicator.node_grid, n_ghost_layers);
ek_container = std::make_shared<EK::EKWalberla::ek_container_type>(
params.tau, walberla::new_ek_poisson_none(ek_lattice, single_precision));
ek_reactions = std::make_shared<EK::EKWalberla::ek_reactions_type>();
6 changes: 4 additions & 2 deletions src/core/unit_tests/lb_particle_coupling_test.cpp
@@ -102,7 +102,8 @@ static auto make_lb_actor() {
auto constexpr single_precision = false;
lb_params = std::make_shared<LB::LBWalberlaParams>(params.agrid, params.tau);
lb_lattice = std::make_shared<LatticeWalberla>(
params.grid_dimensions, ::communicator.node_grid, n_ghost_layers);
params.grid_dimensions, ::communicator.node_grid,
::communicator.node_grid, n_ghost_layers);
lb_fluid = new_lb_walberla_cpu(lb_lattice, params.viscosity, params.density,
single_precision);
lb_fluid->set_collision_model(params.kT, params.seed);
@@ -535,7 +536,8 @@ bool test_lb_domain_mismatch_local() {
auto const params = std::make_shared<LB::LBWalberlaParams>(0.5, 0.01);
::communicator.node_grid = node_grid_reversed;
auto const lattice = std::make_shared<LatticeWalberla>(
Utils::Vector3i{12, 12, 12}, node_grid_original, n_ghost_layers);
Utils::Vector3i{12, 12, 12}, node_grid_original, node_grid_original,
n_ghost_layers);
auto const ptr = new_lb_walberla_cpu(lattice, 1.0, 1.0, false);
ptr->set_collision_model(0.0, 0);
::communicator.node_grid = node_grid_original;
2 changes: 1 addition & 1 deletion src/python/espressomd/detail/walberla.py
@@ -47,7 +47,7 @@ def __init__(self, *args, **kwargs):
super().__init__(**kwargs)

def valid_keys(self):
return {"agrid", "n_ghost_layers"}
return {"agrid", "n_ghost_layers", "blocks_per_mpi_rank"}

def required_keys(self):
return self.valid_keys()
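For reference, a sketch of building the lattice explicitly with the new key. Note that required_keys() returns valid_keys() here, so all three keys must be passed when constructing a LatticeWalberla directly; importing it via espressomd.lb is an assumption:

```python
import espressomd.lb

lattice = espressomd.lb.LatticeWalberla(
    agrid=1., n_ghost_layers=1, blocks_per_mpi_rank=[2, 1, 1])
lbf = espressomd.lb.LBFluidWalberla(
    lattice=lattice, tau=0.01, kinematic_viscosity=1., density=1.)
```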
8 changes: 5 additions & 3 deletions src/python/espressomd/lb.py
@@ -58,14 +58,14 @@ def validate_params(self, params):

def valid_keys(self):
return {"agrid", "tau", "density", "ext_force_density",
"kinematic_viscosity", "lattice", "kT", "seed"}
"kinematic_viscosity", "lattice", "kT", "seed", "blocks_per_mpi_rank"}

def required_keys(self):
return {"lattice", "density", "kinematic_viscosity", "tau"}

def default_params(self):
return {"lattice": None, "seed": 0, "kT": 0.,
"ext_force_density": [0.0, 0.0, 0.0]}
"ext_force_density": [0.0, 0.0, 0.0], "blocks_per_mpi_rank": [1, 1, 1]}

def mach_limit(self):
"""
@@ -141,6 +141,8 @@ class LBFluidWalberla(HydrodynamicInteraction,
Required for a thermalized fluid. Must be positive.
single_precision : :obj:`bool`, optional
Use single-precision floating-point arithmetic.
blocks_per_mpi_rank : (3,) array_like of :obj:`int`, optional
Distribute more than one block to each MPI rank.

Methods
-------
@@ -240,7 +242,7 @@ def validate_params(self, params):
if "agrid" not in params:
raise ValueError("missing argument 'lattice' or 'agrid'")
params["lattice"] = LatticeWalberla(
agrid=params.pop("agrid"), n_ghost_layers=1)
agrid=params.pop("agrid"), n_ghost_layers=1, blocks_per_mpi_rank=params.get("blocks_per_mpi_rank"))
elif "agrid" in params:
raise ValueError("cannot provide both 'lattice' and 'agrid'")

6 changes: 6 additions & 0 deletions src/script_interface/walberla/LBFluid.cpp
@@ -139,6 +139,12 @@ void LBFluidGPU::make_instance(VariantMap const &params) {
auto const visc = get_value<double>(params, "kinematic_viscosity");
auto const dens = get_value<double>(params, "density");
auto const precision = get_value<bool>(params, "single_precision");
auto const blocks_per_mpi_rank = get_value_or<Utils::Vector3i>(
params, "blocks_per_mpi_rank", Utils::Vector3i{{1, 1, 1}});
Member: Here and elsewhere, do we really need a default value, considering we already provide a default value in the python class?

Author: I added a default value in the Python class LatticeWalberla. blocks_per_mpi_rank in LBFluid is now obtained via get_value<Utils::Vector3i>(m_lattice->get_parameter("blocks_per_mpi_rank")).

if (blocks_per_mpi_rank != Utils::Vector3i{{1, 1, 1}}) {
throw std::runtime_error(
"Using more than one block per MPI rank is not supported for GPU LB");
}
Contributor: "Using more than one block per MPI rank is not supported for GPU LB" (but why, actually?)

Member: how about "GPU LB only uses 1 block per MPI rank"?
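The guard discussed above would surface in Python roughly as follows (a sketch: assumes a CUDA-enabled build, and the exact exception type is an assumption):

```python
import espressomd
import espressomd.lb

system = espressomd.System(box_l=[16., 16., 16.])
system.time_step = 0.01
system.cell_system.skin = 0.5

try:
    lbf = espressomd.lb.LBFluidWalberlaGPU(
        agrid=1., tau=system.time_step, kinematic_viscosity=1.,
        density=1., blocks_per_mpi_rank=[2, 1, 1])
except Exception as err:
    print(err)  # expected: multiple blocks per MPI rank unsupported for GPU LB
```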

auto const lb_lattice = m_lattice->lattice();
auto const lb_visc = m_conv_visc * visc;
auto const lb_dens = m_conv_dens * dens;
14 changes: 13 additions & 1 deletion src/script_interface/walberla/LatticeWalberla.hpp
@@ -43,6 +43,7 @@ class LatticeWalberla : public AutoParameters<LatticeWalberla> {
std::shared_ptr<::LatticeWalberla> m_lattice;
double m_agrid;
Utils::Vector3d m_box_l;
Utils::Vector3i m_blocks_per_mpi_rank;

public:
LatticeWalberla() {
@@ -53,14 +54,25 @@
{"shape", AutoParameter::read_only,
[this]() { return m_lattice->get_grid_dimensions(); }},
{"_box_l", AutoParameter::read_only, [this]() { return m_box_l; }},
{"blocks_per_mpi_rank", AutoParameter::read_only,
[this]() { return m_blocks_per_mpi_rank; }},
});
}

void do_construct(VariantMap const &args) override {
auto const &box_geo = *::System::get_system().box_geo;
m_agrid = get_value<double>(args, "agrid");
m_box_l = get_value_or<Utils::Vector3d>(args, "_box_l", box_geo.length());
m_blocks_per_mpi_rank = get_value_or<Utils::Vector3i>(
args, "blocks_per_mpi_rank", Utils::Vector3i{{1, 1, 1}});
auto const n_ghost_layers = get_value<int>(args, "n_ghost_layers");
auto const block_grid =
Utils::Vector3i{{static_cast<int>(::communicator.node_grid[0] *
m_blocks_per_mpi_rank[0]),
static_cast<int>(::communicator.node_grid[1] *
m_blocks_per_mpi_rank[1]),
static_cast<int>(::communicator.node_grid[2] *
m_blocks_per_mpi_rank[2])}};
Member: Utils::hadamard_product()?

Author: I have rewritten it as you suggested.
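For illustration, the block grid computed here is just the element-wise (Hadamard) product of the node grid and the per-rank block count; a NumPy sketch of the arithmetic, not ESPResSo code:

```python
import numpy as np

node_grid = np.array([2, 1, 1])            # MPI ranks per direction
blocks_per_mpi_rank = np.array([2, 2, 1])  # new parameter from this PR
block_grid = node_grid * blocks_per_mpi_rank  # element-wise product
print(block_grid)  # -> [4 2 1], i.e. 8 waLBerla blocks in total
```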


context()->parallel_try_catch([&]() {
if (m_agrid <= 0.) {
Expand All @@ -72,7 +84,7 @@ class LatticeWalberla : public AutoParameters<LatticeWalberla> {
auto const grid_dim =
::LatticeWalberla::calc_grid_dimensions(m_box_l, m_agrid);
m_lattice = std::make_shared<::LatticeWalberla>(
grid_dim, ::communicator.node_grid,
grid_dim, ::communicator.node_grid, block_grid,
static_cast<unsigned int>(n_ghost_layers));
});
}