Allocating multiple blocks to one MPI rank in LB #5026

Draft · wants to merge 23 commits into base: python · Changes from 16 commits
7 changes: 6 additions & 1 deletion maintainer/benchmarks/lb.py
@@ -50,6 +50,9 @@
parser.add_argument("--output", metavar="FILEPATH", action="store",
type=str, required=False, default="benchmarks.csv",
help="Output file (default: benchmarks.csv)")
parser.add_argument("--blocks_per_mpi_rank", action="store", nargs=3,
type=int, default=[1, 1, 1], required=False,
help="blocks per mpi rank")

args = parser.parse_args()

@@ -105,6 +108,8 @@
print(f"LB shape: {lb_grid}")
print(f"LB agrid: {agrid:.3f}")

blocks_per_mpi_rank = args.blocks_per_mpi_rank

# System
#############################################################
system.box_l = box_l
@@ -145,7 +150,7 @@
if args.multi_gpu:
system.cuda_init_handle.call_method("set_device_id_per_rank")
lbf = lb_class(agrid=agrid, tau=system.time_step, kinematic_viscosity=1.,
density=1., single_precision=args.single_precision)
density=1., single_precision=args.single_precision, blocks_per_mpi_rank=blocks_per_mpi_rank)
system.lb = lbf
if n_part:
system.thermostat.set_lb(LB_fluid=lbf, gamma=1., seed=42)
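For context, here is a minimal sketch of the new parameter as seen from the Python API (block counts are arbitrary illustration values):

```python
import espressomd
import espressomd.lb

system = espressomd.System(box_l=[32., 32., 32.])
system.time_step = 0.01
system.cell_system.skin = 0.5

# split the domain of every MPI rank into 2x1x1 waLBerla blocks
lbf = espressomd.lb.LBFluidWalberla(
    agrid=1., tau=system.time_step, kinematic_viscosity=1.,
    density=1., blocks_per_mpi_rank=[2, 1, 1])
system.lb = lbf
```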
166 changes: 166 additions & 0 deletions maintainer/benchmarks/lb_weakscaling.py
@@ -0,0 +1,166 @@
#
# Copyright (C) 2013-2022 The ESPResSo project
#
# This file is part of ESPResSo.
#
# ESPResSo is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ESPResSo is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

"""
Benchmark Lattice-Boltzmann fluid + Lennard-Jones particles.
"""
Member: Wouldn't it be more sustainable if we only had one LB benchmark file to maintain? argparse is quite flexible, surely we can come up with a way to select strong vs. weak scaling with command line options?

Author: I unified lb.py and lb_weakscaling.py by adding a --weak_scaling option to the argparse setup in lb.py.
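A rough sketch of what that unification could look like; the flag name comes from the comment above, the wiring is an assumption:

```python
import argparse

import numpy as np

parser = argparse.ArgumentParser(description="Benchmark LB simulations.")
parser.add_argument("--weak_scaling", action="store_true", required=False,
                    help="Grow the box with the MPI node grid so the work "
                         "per rank stays constant (default: strong scaling "
                         "with a fixed global box)")
args = parser.parse_args()

box_l = np.array([16., 16., 16.])
node_grid = np.array([2, 2, 1])  # stand-in for system.cell_system.node_grid
if args.weak_scaling:
    box_l = box_l * node_grid  # constant work per rank as ranks are added
print(f"global box: {box_l}")
```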

import espressomd
import espressomd.lb
import benchmarks
import numpy as np
import argparse

parser = argparse.ArgumentParser(description="Benchmark LB simulations. "
"Save the results to a CSV file.")
parser.add_argument("--particles_per_core", metavar="N", action="store",
type=int, default=125, required=False,
help="Number of particles per core")
parser.add_argument("--box_l", action="store", nargs="+",
type=int, default=argparse.SUPPRESS, required=False,
help="Box length (cubic box)")
parser.add_argument("--lb_sites_per_particle", metavar="N_LB", action="store",
type=float, default=28, required=False,
help="Number of LB sites per particle")
parser.add_argument("--volume_fraction", metavar="FRAC", action="store",
type=float, default=0.03, required=False,
help="Fraction of the simulation box volume occupied by "
"particles (range: [0.01-0.74], default: 0.03)")
parser.add_argument("--single_precision", action="store_true", required=False,
help="Using single-precision floating point accuracy")
parser.add_argument("--gpu", action=argparse.BooleanOptionalAction,
default=False, required=False, help="Use GPU implementation")
parser.add_argument("--multi-gpu", action=argparse.BooleanOptionalAction,
default=False, required=False, help="Use multi-GPU implementation")
parser.add_argument("--output", metavar="FILEPATH", action="store",
type=str, required=False, default="benchmarks.csv",
help="Output file (default: benchmarks.csv)")
parser.add_argument("--blocks_per_mpi_rank", action="store", nargs=3,
type=int, default=[1, 1, 1], required=False,
help="blocks per mpi rank")

args = parser.parse_args()

# process and check arguments
n_iterations = 30
assert args.volume_fraction > 0, "--volume_fraction must be a positive number"
assert args.volume_fraction < np.pi / (3 * np.sqrt(2)), \
"--volume_fraction exceeds the physical limit of sphere packing (~0.74)"
assert "box_l" not in args or args.particles_per_core == 0, \
"Argument --box_l requires --particles_per_core=0"

required_features = ["LENNARD_JONES", "WALBERLA"]
if args.gpu:
required_features.append("CUDA")
espressomd.assert_features(required_features)

# make simulation deterministic
np.random.seed(42)

# System
#############################################################
system = espressomd.System(box_l=[1, 1, 1])

# Interaction parameters (Lennard-Jones)
#############################################################

lj_eps = 1.0 # LJ epsilon
lj_sig = 1.0 # particle diameter
lj_cut = lj_sig * 2**(1. / 6.) # cutoff distance

# System parameters
#############################################################
n_proc = system.cell_system.get_state()["n_nodes"]
n_part = n_proc * args.particles_per_core
if n_part == 0:
box_l = 3 * args.box_l if len(args.box_l) == 1 else args.box_l
agrid = 1.
lb_grid = box_l
measurement_steps = 80
else:
# volume of N spheres with radius r: N * (4/3*pi*r^3)
box_l = (n_part * 4. / 3. * np.pi * (lj_sig / 2.)**3
/ args.volume_fraction)**(1. / 3.)
lb_grid = (n_part * args.lb_sites_per_particle)**(1. / 3.)
lb_grid = int(2. * round(lb_grid / 2.))
agrid = box_l / lb_grid
measurement_steps = max(50, int(120**3 / lb_grid**3))
measurement_steps = 40
lb_grid = 3 * [lb_grid]
box_l = 3 * [box_l]

blocks_per_mpi_rank = args.blocks_per_mpi_rank

# System
#############################################################
system.box_l = box_l * system.cell_system.node_grid
print(f"box length: {system.box_l}")
print(f"LB shape: {lb_grid}")
print(f"LB agrid: {agrid:.3f}")

# Integration parameters
#############################################################
system.time_step = 0.01
system.cell_system.skin = 0.5

# Interaction and particle setup
#############################################################
if n_part:
system.non_bonded_inter[0, 0].lennard_jones.set_params(
epsilon=lj_eps, sigma=lj_sig, cutoff=lj_cut, shift="auto")
system.part.add(pos=np.random.random((n_part, 3)) * system.box_l)
benchmarks.minimize(system, n_part / 2.)
system.integrator.set_vv()
system.thermostat.set_langevin(kT=1.0, gamma=1.0, seed=42)

# tuning and equilibration
min_skin = 0.2
max_skin = 1.0
print("Tune skin: {:.3f}".format(system.cell_system.tune_skin(
min_skin=min_skin, max_skin=max_skin, tol=0.05, int_steps=100)))
print("Equilibration")
system.integrator.run(500)
print("Tune skin: {:.3f}".format(system.cell_system.tune_skin(
min_skin=min_skin, max_skin=max_skin, tol=0.05, int_steps=100)))
print("Equilibration")
system.integrator.run(500)
system.thermostat.turn_off()

# LB fluid setup
#############################################################
lb_class = espressomd.lb.LBFluidWalberla
if args.gpu or args.multi_gpu:
lb_class = espressomd.lb.LBFluidWalberlaGPU
if args.multi_gpu:
system.cuda_init_handle.call_method("set_device_id_per_rank")
lbf = lb_class(agrid=agrid, tau=system.time_step, kinematic_viscosity=1.,
density=1., single_precision=args.single_precision, blocks_per_mpi_rank=blocks_per_mpi_rank)
system.lb = lbf
if n_part:
system.thermostat.set_lb(LB_fluid=lbf, gamma=1., seed=42)


# time integration loop
timings = benchmarks.get_timings(system, measurement_steps, n_iterations)

# average time
avg, ci = benchmarks.get_average_time(timings)
print(f"average: {1000 * avg:.2f} +/- {1000 * ci:.2f} ms (95% C.I.)")

# write report
benchmarks.write_report(args.output, n_proc, timings, measurement_steps)
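As a quick sanity check of the box-sizing arithmetic in the script above, here is a standalone sketch with example numbers (4 MPI ranks, default arguments assumed):

```python
import numpy as np

n_proc = 4                     # MPI ranks
particles_per_core = 125       # default --particles_per_core
volume_fraction = 0.03         # default --volume_fraction
lj_sig = 1.0                   # particle diameter
lb_sites_per_particle = 28.    # default --lb_sites_per_particle

n_part = n_proc * particles_per_core
# volume of N spheres with radius lj_sig/2, scaled to the target packing fraction
box_l = (n_part * 4. / 3. * np.pi * (lj_sig / 2.)**3
         / volume_fraction)**(1. / 3.)
# round the LB grid to an even number of sites per direction
lb_grid = int(2. * round((n_part * lb_sites_per_particle)**(1. / 3.) / 2.))
print(f"box_l = {box_l:.2f}, lb_grid = {lb_grid}, agrid = {box_l / lb_grid:.3f}")
```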
6 changes: 6 additions & 0 deletions src/core/integrate.cpp
@@ -633,12 +633,18 @@ int System::System::integrate(int n_steps, int reuse_forces) {
ek.propagate();
}
} else if (lb_active) {
#ifdef CALIPER
CALI_MARK_BEGIN("LB.PROPAGATE");
#endif
auto const md_steps_per_lb_step = calc_md_steps_per_tau(lb.get_tau());
propagation.lb_skipped_md_steps += 1;
if (propagation.lb_skipped_md_steps >= md_steps_per_lb_step) {
propagation.lb_skipped_md_steps = 0;
lb.propagate();
}
#ifdef CALIPER
CALI_MARK_END("LB.PROPAGATE");
#endif
} else if (ek_active) {
auto const md_steps_per_ek_step = calc_md_steps_per_tau(ek.get_tau());
propagation.ek_skipped_md_steps += 1;
14 changes: 13 additions & 1 deletion src/core/lb/LBWalberla.cpp
@@ -40,6 +40,10 @@
#include <optional>
#include <variant>

#ifdef CALIPER
#include <caliper/cali.h>
#endif

namespace LB {

bool LBWalberla::is_gpu() const { return lb_fluid->is_gpu(); }
@@ -50,7 +54,15 @@ Utils::VectorXd<9> LBWalberla::get_pressure_tensor() const {
return lb_fluid->get_pressure_tensor();
}

void LBWalberla::propagate() { lb_fluid->integrate(); }
void LBWalberla::propagate() {
#ifdef CALIPER
CALI_MARK_BEGIN("LBWalberla.PROPAGATE");
#endif
lb_fluid->integrate();
#ifdef CALIPER
CALI_MARK_END("LBWalberla.PROPAGATE");
#endif
}

void LBWalberla::ghost_communication() { lb_fluid->ghost_communication(); }

10 changes: 10 additions & 0 deletions src/core/lb/Solver.cpp
@@ -47,6 +47,10 @@
#include <variant>
#include <vector>

#ifdef CALIPER
#include <caliper/cali.h>
#endif

namespace LB {

Solver::Solver() { impl = std::make_unique<Implementation>(); }
@@ -69,8 +73,14 @@ void Solver::reset() {
}

void Solver::propagate() {
#ifdef CALIPER
CALI_MARK_BEGIN("SOLVER.PROPAGATE");
#endif
check_solver(impl);
std::visit([](auto &ptr) { ptr->propagate(); }, *impl->solver);
#ifdef CALIPER
CALI_MARK_END("SOLVER.PROPAGATE");
#endif
}

void Solver::ghost_communication() {
3 changes: 2 additions & 1 deletion src/core/unit_tests/ek_interface_test.cpp
@@ -83,7 +83,8 @@ static auto make_ek_actor() {
auto constexpr n_ghost_layers = 1u;
auto constexpr single_precision = true;
ek_lattice = std::make_shared<LatticeWalberla>(
params.grid_dimensions, ::communicator.node_grid, n_ghost_layers);
params.grid_dimensions, ::communicator.node_grid,
::communicator.node_grid, n_ghost_layers);
ek_container = std::make_shared<EK::EKWalberla::ek_container_type>(
params.tau, walberla::new_ek_poisson_none(ek_lattice, single_precision));
ek_reactions = std::make_shared<EK::EKWalberla::ek_reactions_type>();
6 changes: 4 additions & 2 deletions src/core/unit_tests/lb_particle_coupling_test.cpp
@@ -102,7 +102,8 @@ static auto make_lb_actor() {
auto constexpr single_precision = false;
lb_params = std::make_shared<LB::LBWalberlaParams>(params.agrid, params.tau);
lb_lattice = std::make_shared<LatticeWalberla>(
params.grid_dimensions, ::communicator.node_grid, n_ghost_layers);
params.grid_dimensions, ::communicator.node_grid,
::communicator.node_grid, n_ghost_layers);
lb_fluid = new_lb_walberla_cpu(lb_lattice, params.viscosity, params.density,
single_precision);
lb_fluid->set_collision_model(params.kT, params.seed);
@@ -535,7 +536,8 @@ bool test_lb_domain_mismatch_local() {
auto const params = std::make_shared<LB::LBWalberlaParams>(0.5, 0.01);
::communicator.node_grid = node_grid_reversed;
auto const lattice = std::make_shared<LatticeWalberla>(
Utils::Vector3i{12, 12, 12}, node_grid_original, n_ghost_layers);
Utils::Vector3i{12, 12, 12}, node_grid_original, node_grid_original,
n_ghost_layers);
auto const ptr = new_lb_walberla_cpu(lattice, 1.0, 1.0, false);
ptr->set_collision_model(0.0, 0);
::communicator.node_grid = node_grid_original;
2 changes: 1 addition & 1 deletion src/python/espressomd/detail/walberla.py
@@ -47,7 +47,7 @@ def __init__(self, *args, **kwargs):
super().__init__(**kwargs)

def valid_keys(self):
return {"agrid", "n_ghost_layers"}
return {"agrid", "n_ghost_layers", "blocks_per_mpi_rank"}

def required_keys(self):
return self.valid_keys()
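For reference, a sketch of building the lattice explicitly with the new key. Note that required_keys() returns valid_keys() here, so all three keys must be passed when constructing a LatticeWalberla directly; importing it via espressomd.lb is an assumption:

```python
import espressomd.lb

lattice = espressomd.lb.LatticeWalberla(
    agrid=1., n_ghost_layers=1, blocks_per_mpi_rank=[2, 1, 1])
lbf = espressomd.lb.LBFluidWalberla(
    lattice=lattice, tau=0.01, kinematic_viscosity=1., density=1.)
```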
8 changes: 5 additions & 3 deletions src/python/espressomd/lb.py
@@ -58,14 +58,14 @@ def validate_params(self, params):

def valid_keys(self):
return {"agrid", "tau", "density", "ext_force_density",
"kinematic_viscosity", "lattice", "kT", "seed"}
"kinematic_viscosity", "lattice", "kT", "seed", "blocks_per_mpi_rank"}

def required_keys(self):
return {"lattice", "density", "kinematic_viscosity", "tau"}

def default_params(self):
return {"lattice": None, "seed": 0, "kT": 0.,
"ext_force_density": [0.0, 0.0, 0.0]}
"ext_force_density": [0.0, 0.0, 0.0], "blocks_per_mpi_rank": [1, 1, 1]}

def mach_limit(self):
"""
@@ -141,6 +141,8 @@ class LBFluidWalberla(HydrodynamicInteraction,
Required for a thermalized fluid. Must be positive.
single_precision : :obj:`bool`, optional
Use single-precision floating-point arithmetic.
blocks_per_mpi_rank : (3,) array_like of :obj:`int`, optional
Distribute more than one block to each MPI rank.

Methods
-------
@@ -240,7 +242,7 @@ def validate_params(self, params):
if "agrid" not in params:
raise ValueError("missing argument 'lattice' or 'agrid'")
params["lattice"] = LatticeWalberla(
agrid=params.pop("agrid"), n_ghost_layers=1)
agrid=params.pop("agrid"), n_ghost_layers=1, blocks_per_mpi_rank=params.get("blocks_per_mpi_rank"))
elif "agrid" in params:
raise ValueError("cannot provide both 'lattice' and 'agrid'")

6 changes: 6 additions & 0 deletions src/script_interface/walberla/LBFluid.cpp
@@ -139,6 +139,12 @@ void LBFluidGPU::make_instance(VariantMap const &params) {
auto const visc = get_value<double>(params, "kinematic_viscosity");
auto const dens = get_value<double>(params, "density");
auto const precision = get_value<bool>(params, "single_precision");
auto const blocks_per_mpi_rank = get_value_or<Utils::Vector3i>(
params, "blocks_per_mpi_rank", Utils::Vector3i{{1, 1, 1}});
Member: Here and elsewhere, do we really need a default value, considering we already provide a default value in the python class?

Author: I added a default value in the Python class LatticeWalberla. blocks_per_mpi_rank in LBFluid is now obtained via get_value<Utils::Vector3i>(m_lattice->get_parameter("blocks_per_mpi_rank")).

if (blocks_per_mpi_rank != Utils::Vector3i{{1, 1, 1}}) {
throw std::runtime_error(
"Using more than one block per MPI rank is not supported for GPU LB");
}
Contributor: "Using more than one block per MPI rank is not supported for GPU LB" (but why, actually?)

Member: how about "GPU LB only uses 1 block per MPI rank"?
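The guard discussed above would surface in Python roughly as follows (a sketch: assumes a CUDA-enabled build, and the exact exception type is an assumption):

```python
import espressomd
import espressomd.lb

system = espressomd.System(box_l=[16., 16., 16.])
system.time_step = 0.01
system.cell_system.skin = 0.5

try:
    lbf = espressomd.lb.LBFluidWalberlaGPU(
        agrid=1., tau=system.time_step, kinematic_viscosity=1.,
        density=1., blocks_per_mpi_rank=[2, 1, 1])
except Exception as err:
    print(err)  # expected: multiple blocks per MPI rank unsupported for GPU LB
```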

auto const lb_lattice = m_lattice->lattice();
auto const lb_visc = m_conv_visc * visc;
auto const lb_dens = m_conv_dens * dens;
14 changes: 13 additions & 1 deletion src/script_interface/walberla/LatticeWalberla.hpp
@@ -43,6 +43,7 @@ class LatticeWalberla : public AutoParameters<LatticeWalberla> {
std::shared_ptr<::LatticeWalberla> m_lattice;
double m_agrid;
Utils::Vector3d m_box_l;
Utils::Vector3i m_blocks_per_mpi_rank;

public:
LatticeWalberla() {
@@ -53,14 +54,25 @@
{"shape", AutoParameter::read_only,
[this]() { return m_lattice->get_grid_dimensions(); }},
{"_box_l", AutoParameter::read_only, [this]() { return m_box_l; }},
{"blocks_per_mpi_rank", AutoParameter::read_only,
[this]() { return m_blocks_per_mpi_rank; }},
});
}

void do_construct(VariantMap const &args) override {
auto const &box_geo = *::System::get_system().box_geo;
m_agrid = get_value<double>(args, "agrid");
m_box_l = get_value_or<Utils::Vector3d>(args, "_box_l", box_geo.length());
m_blocks_per_mpi_rank = get_value_or<Utils::Vector3i>(
args, "blocks_per_mpi_rank", Utils::Vector3i{{1, 1, 1}});
auto const n_ghost_layers = get_value<int>(args, "n_ghost_layers");
auto const block_grid =
Utils::Vector3i{{static_cast<int>(::communicator.node_grid[0] *
m_blocks_per_mpi_rank[0]),
static_cast<int>(::communicator.node_grid[1] *
m_blocks_per_mpi_rank[1]),
static_cast<int>(::communicator.node_grid[2] *
m_blocks_per_mpi_rank[2])}};
Member: Utils::hadamard_product()?

Author: I have rewritten it as you suggested.
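For illustration, the block grid computed here is just the element-wise (Hadamard) product of the node grid and the per-rank block count; a NumPy sketch of the arithmetic, not ESPResSo code:

```python
import numpy as np

node_grid = np.array([2, 1, 1])            # MPI ranks per direction
blocks_per_mpi_rank = np.array([2, 2, 1])  # new parameter from this PR
block_grid = node_grid * blocks_per_mpi_rank  # element-wise product
print(block_grid)  # -> [4 2 1], i.e. 8 waLBerla blocks in total
```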


context()->parallel_try_catch([&]() {
if (m_agrid <= 0.) {
Expand All @@ -72,7 +84,7 @@ class LatticeWalberla : public AutoParameters<LatticeWalberla> {
auto const grid_dim =
::LatticeWalberla::calc_grid_dimensions(m_box_l, m_agrid);
m_lattice = std::make_shared<::LatticeWalberla>(
grid_dim, ::communicator.node_grid,
grid_dim, ::communicator.node_grid, block_grid,
static_cast<unsigned int>(n_ghost_layers));
});
}