Allocating multiple blocks to one mpi rank in LB #5026

Draft pull request: wants to merge 23 commits into base branch python.
17 changes: 12 additions & 5 deletions maintainer/benchmarks/lb.py
@@ -50,6 +50,11 @@
parser.add_argument("--output", metavar="FILEPATH", action="store",
type=str, required=False, default="benchmarks.csv",
help="Output file (default: benchmarks.csv)")
parser.add_argument("--blocks_per_mpi_rank", action="store", nargs=3,
type=int, default=[1, 1, 1], required=False,
help="blocks per mpi rank")
parser.add_argument("--weak_scaling", action="store_true", required=False,
help="The measurement of weak scaling")

args = parser.parse_args()

@@ -101,13 +106,15 @@
lb_grid = 3 * [lb_grid]
box_l = 3 * [box_l]

print(f"box length: {box_l}")
print(f"LB shape: {lb_grid}")
print(f"LB agrid: {agrid:.3f}")

# System
#############################################################
system.box_l = box_l
if args.weak_scaling:
system.box_l = box_l * system.cell_system.node_grid
print(f"box length: {system.box_l}")
print(f"LB shape: {lb_grid}")
print(f"LB agrid: {agrid:.3f}")


# Integration parameters
#############################################################
@@ -145,7 +152,7 @@
if args.multi_gpu:
system.cuda_init_handle.call_method("set_device_id_per_rank")
lbf = lb_class(agrid=agrid, tau=system.time_step, kinematic_viscosity=1.,
density=1., single_precision=args.single_precision)
density=1., single_precision=args.single_precision, blocks_per_mpi_rank=args.blocks_per_mpi_rank)
system.lb = lbf
if n_part:
system.thermostat.set_lb(LB_fluid=lbf, gamma=1., seed=42)
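
As a quick orientation, the sketch below reproduces the arithmetic behind the two new benchmark flags with made-up numbers (the node grid, box size and block counts are illustrative, not taken from this PR): with --weak_scaling the box is scaled component-wise by the MPI node grid, and blocks_per_mpi_rank multiplies the number of waLBerla blocks per direction.

import numpy as np

# Illustrative values only (not from the PR): a 64^3 box per rank,
# a 2x2x1 MPI node grid and 2x1x1 LB blocks per rank.
box_l = np.array([64.0, 64.0, 64.0])
node_grid = np.array([2, 2, 1])
blocks_per_mpi_rank = np.array([2, 1, 1])

# Weak scaling as done in the patched benchmark:
# system.box_l = box_l * system.cell_system.node_grid
weak_scaling_box_l = box_l * node_grid            # [128., 128., 64.]

# Total number of waLBerla blocks the LB lattice is split into.
total_blocks = node_grid * blocks_per_mpi_rank    # [4, 2, 1]

print(weak_scaling_box_l, total_blocks)
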
6 changes: 6 additions & 0 deletions src/core/integrate.cpp
@@ -633,12 +633,18 @@ int System::System::integrate(int n_steps, int reuse_forces) {
ek.propagate();
}
} else if (lb_active) {
#ifdef CALIPER
CALI_MARK_BEGIN("LB.PROPAGATE");
#endif
auto const md_steps_per_lb_step = calc_md_steps_per_tau(lb.get_tau());
propagation.lb_skipped_md_steps += 1;
if (propagation.lb_skipped_md_steps >= md_steps_per_lb_step) {
propagation.lb_skipped_md_steps = 0;
lb.propagate();
}
#ifdef CALIPER
CALI_MARK_END("LB.PROPAGATE");
#endif
} else if (ek_active) {
auto const md_steps_per_ek_step = calc_md_steps_per_tau(ek.get_tau());
propagation.ek_skipped_md_steps += 1;
14 changes: 13 additions & 1 deletion src/core/lb/LBWalberla.cpp
@@ -40,6 +40,10 @@
#include <optional>
#include <variant>

#ifdef CALIPER
#include <caliper/cali.h>
#endif

namespace LB {

bool LBWalberla::is_gpu() const { return lb_fluid->is_gpu(); }
@@ -50,7 +54,15 @@ Utils::VectorXd<9> LBWalberla::get_pressure_tensor() const {
return lb_fluid->get_pressure_tensor();
}

void LBWalberla::propagate() { lb_fluid->integrate(); }
void LBWalberla::propagate() {
#ifdef CALIPER
CALI_MARK_BEGIN("LBWalberla.PROPAGATE");
#endif
lb_fluid->integrate();
#ifdef CALIPER
CALI_MARK_END("LBWalberla.PROPAGATE");
#endif
}

void LBWalberla::ghost_communication() { lb_fluid->ghost_communication(); }

10 changes: 10 additions & 0 deletions src/core/lb/Solver.cpp
@@ -47,6 +47,10 @@
#include <variant>
#include <vector>

#ifdef CALIPER
#include <caliper/cali.h>
#endif

namespace LB {

Solver::Solver() { impl = std::make_unique<Implementation>(); }
@@ -69,8 +73,14 @@ void Solver::reset() {
}

void Solver::propagate() {
#ifdef CALIPER
CALI_MARK_BEGIN("SOLVER.PROPAGATE");
#endif
check_solver(impl);
std::visit([](auto &ptr) { ptr->propagate(); }, *impl->solver);
#ifdef CALIPER
CALI_MARK_END("SOLVER.PROPAGATE");
#endif
}

void Solver::ghost_communication() {
3 changes: 2 additions & 1 deletion src/core/unit_tests/ek_interface_test.cpp
@@ -83,7 +83,8 @@ static auto make_ek_actor() {
auto constexpr n_ghost_layers = 1u;
auto constexpr single_precision = true;
ek_lattice = std::make_shared<LatticeWalberla>(
params.grid_dimensions, ::communicator.node_grid, n_ghost_layers);
params.grid_dimensions, ::communicator.node_grid,
::communicator.node_grid, n_ghost_layers);
ek_container = std::make_shared<EK::EKWalberla::ek_container_type>(
params.tau, walberla::new_ek_poisson_none(ek_lattice, single_precision));
ek_reactions = std::make_shared<EK::EKWalberla::ek_reactions_type>();
6 changes: 4 additions & 2 deletions src/core/unit_tests/lb_particle_coupling_test.cpp
@@ -102,7 +102,8 @@ static auto make_lb_actor() {
auto constexpr single_precision = false;
lb_params = std::make_shared<LB::LBWalberlaParams>(params.agrid, params.tau);
lb_lattice = std::make_shared<LatticeWalberla>(
params.grid_dimensions, ::communicator.node_grid, n_ghost_layers);
params.grid_dimensions, ::communicator.node_grid,
::communicator.node_grid, n_ghost_layers);
lb_fluid = new_lb_walberla_cpu(lb_lattice, params.viscosity, params.density,
single_precision);
lb_fluid->set_collision_model(params.kT, params.seed);
@@ -535,7 +536,8 @@ bool test_lb_domain_mismatch_local() {
auto const params = std::make_shared<LB::LBWalberlaParams>(0.5, 0.01);
::communicator.node_grid = node_grid_reversed;
auto const lattice = std::make_shared<LatticeWalberla>(
Utils::Vector3i{12, 12, 12}, node_grid_original, n_ghost_layers);
Utils::Vector3i{12, 12, 12}, node_grid_original, node_grid_original,
n_ghost_layers);
auto const ptr = new_lb_walberla_cpu(lattice, 1.0, 1.0, false);
ptr->set_collision_model(0.0, 0);
::communicator.node_grid = node_grid_original;
4 changes: 2 additions & 2 deletions src/python/espressomd/detail/walberla.py
@@ -47,13 +47,13 @@ def __init__(self, *args, **kwargs):
super().__init__(**kwargs)

def valid_keys(self):
return {"agrid", "n_ghost_layers"}
return {"agrid", "n_ghost_layers", "blocks_per_mpi_rank"}

def required_keys(self):
return self.valid_keys()

def default_params(self):
return {}
return {"blocks_per_mpi_rank": [1, 1, 1]}

def get_node_indices_inside_shape(self, shape):
if not isinstance(shape, espressomd.shapes.Shape):
8 changes: 5 additions & 3 deletions src/python/espressomd/lb.py
@@ -58,14 +58,14 @@ def validate_params(self, params):

def valid_keys(self):
return {"agrid", "tau", "density", "ext_force_density",
"kinematic_viscosity", "lattice", "kT", "seed"}
"kinematic_viscosity", "lattice", "kT", "seed", "blocks_per_mpi_rank"}

def required_keys(self):
return {"lattice", "density", "kinematic_viscosity", "tau"}

def default_params(self):
return {"lattice": None, "seed": 0, "kT": 0.,
"ext_force_density": [0.0, 0.0, 0.0]}
"ext_force_density": [0.0, 0.0, 0.0], "blocks_per_mpi_rank": [1, 1, 1]}

def mach_limit(self):
"""
@@ -141,6 +141,8 @@ class LBFluidWalberla(HydrodynamicInteraction,
Required for a thermalized fluid. Must be positive.
single_precision : :obj:`bool`, optional
Use single-precision floating-point arithmetic.
blocks_per_mpi_rank : (3,) array_like of :obj:`int`, optional
Distribute more than one block to each CPU.

Methods
-------
@@ -240,7 +242,7 @@ def validate_params(self, params):
if "agrid" not in params:
raise ValueError("missing argument 'lattice' or 'agrid'")
params["lattice"] = LatticeWalberla(
agrid=params.pop("agrid"), n_ghost_layers=1)
agrid=params.pop("agrid"), n_ghost_layers=1, blocks_per_mpi_rank=params.pop("blocks_per_mpi_rank"))
elif "agrid" in params:
raise ValueError("cannot provide both 'lattice' and 'agrid'")

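
For reference, a minimal usage sketch of the new keyword on the Python level (box size, time step and block counts are arbitrary example values; the commented-out explicit-lattice variant assumes LatticeWalberla is importable from espressomd.lb, as used in the diff above):

import espressomd
import espressomd.lb

system = espressomd.System(box_l=[32.0, 32.0, 32.0])
system.time_step = 0.01
system.cell_system.skin = 0.4

# Convenience path: pass 'agrid' together with the new 'blocks_per_mpi_rank';
# validate_params() builds the LatticeWalberla internally (see the diff above).
lbf = espressomd.lb.LBFluidWalberla(
    agrid=1.0, tau=0.01, density=1.0, kinematic_viscosity=1.0,
    blocks_per_mpi_rank=[2, 1, 1])
system.lb = lbf

# Equivalent explicit path: build the lattice first, then hand it over.
# lattice = espressomd.lb.LatticeWalberla(agrid=1.0, n_ghost_layers=1,
#                                         blocks_per_mpi_rank=[2, 1, 1])
# lbf = espressomd.lb.LBFluidWalberla(lattice=lattice, tau=0.01, density=1.0,
#                                     kinematic_viscosity=1.0)
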
6 changes: 6 additions & 0 deletions src/script_interface/walberla/LBFluid.cpp
@@ -139,6 +139,12 @@ void LBFluidGPU::make_instance(VariantMap const &params) {
auto const visc = get_value<double>(params, "kinematic_viscosity");
auto const dens = get_value<double>(params, "density");
auto const precision = get_value<bool>(params, "single_precision");
auto const blocks_per_mpi_rank = get_value<Utils::Vector3i>(
m_lattice->get_parameter("blocks_per_mpi_rank"));
if (blocks_per_mpi_rank != Utils::Vector3i{{1, 1, 1}}) {
throw std::runtime_error(
"Using more than one block per MPI rank is not supported for GPU LB");
}
Contributor:
"Using more than one block per MPI rank is not supported for GPU LB" (but why, actually?)

Member:
how about "GPU LB only uses 1 block per MPI rank"?

auto const lb_lattice = m_lattice->lattice();
auto const lb_visc = m_conv_visc * visc;
auto const lb_dens = m_conv_dens * dens;
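
A hedged illustration of the guard above from the Python side (requires a CUDA-enabled build; the class name LBFluidWalberlaGPU and the exception type are assumptions based on the existing GPU interface): requesting more than one block per MPI rank for the GPU fluid is expected to fail at construction.

import espressomd
import espressomd.lb

system = espressomd.System(box_l=[32.0, 32.0, 32.0])
system.time_step = 0.01
system.cell_system.skin = 0.4

try:
    lbf = espressomd.lb.LBFluidWalberlaGPU(
        agrid=1.0, tau=0.01, density=1.0, kinematic_viscosity=1.0,
        single_precision=True, blocks_per_mpi_rank=[2, 1, 1])
except RuntimeError as err:
    # Expected: "Using more than one block per MPI rank is not supported for GPU LB"
    print(err)
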
10 changes: 8 additions & 2 deletions src/script_interface/walberla/LatticeWalberla.hpp
@@ -43,6 +43,7 @@ class LatticeWalberla : public AutoParameters<LatticeWalberla> {
std::shared_ptr<::LatticeWalberla> m_lattice;
double m_agrid;
Utils::Vector3d m_box_l;
Utils::Vector3i m_blocks_per_mpi_rank;

public:
LatticeWalberla() {
@@ -53,15 +54,20 @@
{"shape", AutoParameter::read_only,
[this]() { return m_lattice->get_grid_dimensions(); }},
{"_box_l", AutoParameter::read_only, [this]() { return m_box_l; }},
{"blocks_per_mpi_rank", AutoParameter::read_only,
[this]() { return m_blocks_per_mpi_rank; }},
});
}

void do_construct(VariantMap const &args) override {
auto const &box_geo = *::System::get_system().box_geo;
m_agrid = get_value<double>(args, "agrid");
m_box_l = get_value_or<Utils::Vector3d>(args, "_box_l", box_geo.length());
m_blocks_per_mpi_rank =
get_value<Utils::Vector3i>(args, "blocks_per_mpi_rank");
auto const n_ghost_layers = get_value<int>(args, "n_ghost_layers");

auto const block_grid = Utils::hadamard_product(::communicator.node_grid,
m_blocks_per_mpi_rank);
context()->parallel_try_catch([&]() {
if (m_agrid <= 0.) {
throw std::domain_error("Parameter 'agrid' must be > 0");
@@ -72,7 +78,7 @@
auto const grid_dim =
::LatticeWalberla::calc_grid_dimensions(m_box_l, m_agrid);
m_lattice = std::make_shared<::LatticeWalberla>(
grid_dim, ::communicator.node_grid,
grid_dim, ::communicator.node_grid, block_grid,
static_cast<unsigned int>(n_ghost_layers));
});
}
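
To make the decomposition arithmetic concrete (numbers below are invented): the total block grid is the element-wise (Hadamard) product of the MPI node grid and blocks_per_mpi_rank, and each lattice dimension must be divisible by the corresponding block count, mirroring the check added in LatticeWalberla.cpp further down.

# Toy check mirroring the C++ logic, with invented example numbers.
grid_dimensions = [128, 64, 64]      # LB cells per direction (box_l / agrid)
node_grid = [2, 2, 1]                # MPI ranks per direction
blocks_per_mpi_rank = [2, 1, 1]      # new parameter introduced by this PR

# Hadamard product: total number of waLBerla blocks per direction.
block_grid = [n * b for n, b in zip(node_grid, blocks_per_mpi_rank)]  # [4, 2, 1]

# Cells per block, with the same divisibility requirement as the C++ code.
cells_per_block = []
for dim, blocks in zip(grid_dimensions, block_grid):
    if dim % blocks != 0:
        raise RuntimeError(
            "Lattice grid dimensions and block grid are not compatible.")
    cells_per_block.append(dim // blocks)

print(block_grid, cells_per_block)   # [4, 2, 1] [32, 32, 64]
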
src/walberla_bridge/include/walberla_bridge/LatticeWalberla.hpp
@@ -52,6 +52,7 @@ class LatticeWalberla {
public:
LatticeWalberla(Utils::Vector3i const &grid_dimensions,
Utils::Vector3i const &node_grid,
Utils::Vector3i const &block_grid,
unsigned int n_ghost_layers);

// Grid, domain, halo
4 changes: 2 additions & 2 deletions src/walberla_bridge/src/BoundaryPackInfo.hpp
@@ -96,7 +96,7 @@ class BoundaryPackInfo : public PackInfo<GhostLayerField_T> {
WALBERLA_ASSERT_EQUAL(bSize, buf_size);
#endif

auto const offset = std::get<0>(m_lattice->get_local_grid_range());
auto const offset = to_vector3i(receiver->getAABB().min());
Contributor:
Wouldn't it be better to have functions for this in the Lattice class, so they can be used by EK as well? After all, LB and EK probably need to agree about both the MPI and the block decomposition?
typename Boundary_T::value_type value;
for (auto it = begin(flag_field); it != flag_field->end(); ++it) {
if (isFlagSet(it, boundary_flag)) {
Expand Down Expand Up @@ -133,7 +133,7 @@ class BoundaryPackInfo : public PackInfo<GhostLayerField_T> {
<< buf_size;
#endif

auto const offset = std::get<0>(m_lattice->get_local_grid_range());
auto const offset = to_vector3i(sender->getAABB().min());
for (auto it = begin(flag_field); it != flag_field->end(); ++it) {
if (isFlagSet(it, boundary_flag)) {
auto const node = offset + Utils::Vector3i{{it.x(), it.y(), it.z()}};
36 changes: 26 additions & 10 deletions src/walberla_bridge/src/LatticeWalberla.cpp
@@ -40,6 +40,7 @@

LatticeWalberla::LatticeWalberla(Utils::Vector3i const &grid_dimensions,
Utils::Vector3i const &node_grid,
Utils::Vector3i const &block_grid,
unsigned int n_ghost_layers)
: m_grid_dimensions{grid_dimensions}, m_n_ghost_layers{n_ghost_layers} {
using walberla::real_t;
@@ -50,21 +51,28 @@ LatticeWalberla::LatticeWalberla(Utils::Vector3i const &grid_dimensions,
throw std::runtime_error(
"Lattice grid dimensions and MPI node grid are not compatible.");
}
if (m_grid_dimensions[i] % block_grid[i] != 0) {
throw std::runtime_error(
"Lattice grid dimensions and block grid are not compatible.");
}
}

auto constexpr lattice_constant = real_t{1};
auto const cells_block = Utils::hadamard_division(grid_dimensions, node_grid);
Contributor:
cells_per_block?

Author:
I changed cells_block to cells_per_block.
auto const cells_per_block =
Utils::hadamard_division(grid_dimensions, block_grid);

m_blocks = walberla::blockforest::createUniformBlockGrid(
// number of blocks in each direction
uint_c(node_grid[0]), uint_c(node_grid[1]), uint_c(node_grid[2]),
uint_c(block_grid[0]), uint_c(block_grid[1]), uint_c(block_grid[2]),
// number of cells per block in each direction
uint_c(cells_block[0]), uint_c(cells_block[1]), uint_c(cells_block[2]),
lattice_constant,
uint_c(cells_per_block[0]), uint_c(cells_per_block[1]),
uint_c(cells_per_block[2]), lattice_constant,
// number of cpus per direction
uint_c(node_grid[0]), uint_c(node_grid[1]), uint_c(node_grid[2]),
// periodicity
true, true, true);
true, true, true,
// keep global block information
Contributor:
What does this do/mean?

Author:
If "keep global block information" is true, each process keeps information about remote blocks that reside on other processes.
false);
for (IBlock &block : *m_blocks) {
m_cached_blocks.push_back(&block);
}
@@ -73,11 +81,19 @@
[[nodiscard]] std::pair<Utils::Vector3d, Utils::Vector3d>
Contributor:
Are we sure that there is no function in block forest or uniform block forest which already supplies the answer to this? If not, and we really have to calculate this ourselves, it could probably be a bit more compact. ChatGPT came up with:

    // Initialize bounds to extreme values
    std::array<double, 3> globalMin = {std::numeric_limits<double>::max(),
                                       std::numeric_limits<double>::max(),
                                       std::numeric_limits<double>::max()};
    std::array<double, 3> globalMax = {std::numeric_limits<double>::lowest(),
                                       std::numeric_limits<double>::lowest(),
                                       std::numeric_limits<double>::lowest()};

    for (const auto& block : blocks) {
        const auto& blockMin = block.min();
        const auto& blockMax = block.max();
        for (size_t i = 0; i < 3; ++i) {
            globalMin[i] = std::min(globalMin[i], blockMin[i]);
            globalMax[i] = std::max(globalMax[i], blockMax[i]);
        }
    }

Author:
In my understanding, a BlockForest knows the domain size of the whole space, and blocks know their domain range within that space. Hence, there is no class that knows the domain range of a single MPI rank. I therefore rewrote the code to make it more compact.

LatticeWalberla::get_local_domain() const {
using walberla::to_vector3d;
// We only have one block per mpi rank
assert(++(m_blocks->begin()) == m_blocks->end());

auto const ab = m_blocks->begin()->getAABB();
return {to_vector3d(ab.min()), to_vector3d(ab.max())};
// Get upper and lower corner of BlockForest assigned to a mpi rank.
// Since we can allocate multiple blocks per mpi rank,
// the corners of all Blocks are compared.
auto aa = to_vector3d(m_blocks->begin()->getAABB().min());
auto bb = to_vector3d(m_blocks->begin()->getAABB().max());
for (auto b = m_blocks->begin(); b != m_blocks->end(); ++b) {
auto cc = b->getAABB();
for (auto const i : {0u, 1u, 2u}) {
aa[i] = std::min(aa[i], cc.min()[i]);
bb[i] = std::max(bb[i], cc.max()[i]);
}
}
return {aa, bb};
}

[[nodiscard]] bool
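
The effect of the reworked get_local_domain() can be pictured with a toy example (block extents invented): a rank that owns several blocks now reports the bounding box enclosing all of them instead of assuming exactly one block per rank.

# Toy reproduction of the min/max reduction over the block AABBs on one rank.
blocks = [
    ((0.0, 0.0, 0.0), (32.0, 32.0, 64.0)),    # first block owned by this rank
    ((32.0, 0.0, 0.0), (64.0, 32.0, 64.0)),   # second block owned by this rank
]

lower = list(blocks[0][0])
upper = list(blocks[0][1])
for lo, hi in blocks[1:]:
    for i in range(3):
        lower[i] = min(lower[i], lo[i])
        upper[i] = max(upper[i], hi[i])

print(lower, upper)   # [0.0, 0.0, 0.0] [64.0, 32.0, 64.0]
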