Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add miniapp triangular multiplication #1029

Merged
merged 17 commits into from
Dec 12, 2023
2 changes: 2 additions & 0 deletions miniapp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ DLAF_addMiniapp(miniapp_bt_reduction_to_band SOURCES miniapp_bt_reduction_to_ban

DLAF_addMiniapp(miniapp_triangular_solver SOURCES miniapp_triangular_solver.cpp)

DLAF_addMiniapp(miniapp_triangular_multiplication SOURCES miniapp_triangular_multiplication.cpp)

DLAF_addMiniapp(miniapp_eigensolver SOURCES miniapp_eigensolver.cpp)

DLAF_addMiniapp(miniapp_gen_eigensolver SOURCES miniapp_gen_eigensolver.cpp)
Expand Down
288 changes: 288 additions & 0 deletions miniapp/miniapp_triangular_multiplication.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2023, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//

#include <cstdlib>
#include <iostream>
#include <limits>
#include <type_traits>

#include <blas/util.hh>
#include <mpi.h>

#include <pika/init.hpp>
#include <pika/program_options.hpp>
#include <pika/runtime.hpp>

#include <dlaf/common/format_short.h>
#include <dlaf/common/index2d.h>
#include <dlaf/common/range2d.h>
#include <dlaf/common/timer.h>
#include <dlaf/communication/communicator_grid.h>
#include <dlaf/communication/init.h>
#include <dlaf/init.h>
#include <dlaf/matrix/copy.h>
#include <dlaf/matrix/index.h>
#include <dlaf/matrix/matrix_mirror.h>
#include <dlaf/miniapp/dispatch.h>
#include <dlaf/miniapp/options.h>
#include <dlaf/multiplication/triangular.h>
#include <dlaf/types.h>
#include <dlaf/util_matrix.h>

namespace {

using dlaf::Backend;
using dlaf::DefaultDevice_v;
using dlaf::Device;
using dlaf::GlobalElementIndex;
using dlaf::GlobalElementSize;
using dlaf::SizeType;
using dlaf::TileElementSize;
using dlaf::comm::Communicator;
using dlaf::comm::CommunicatorGrid;
using dlaf::common::Ordering;

// Command-line options for the triangular multiplication miniapp: the common
// miniapp options (backend, grid, nruns, ...) extended with the problem
// geometry (m, n, mb, nb) and the BLAS trmm parameters (side, uplo, op, diag).
struct Options
    : dlaf::miniapp::MiniappOptions<dlaf::miniapp::SupportReal::Yes, dlaf::miniapp::SupportComplex::No> {
  SizeType m;       // rows of B (A is m x m)
  SizeType n;       // columns of B
  SizeType mb;      // block rows of B (A blocks are mb x mb)
  SizeType nb;      // block columns of B
  blas::Side side;
  blas::Uplo uplo;
  blas::Op op;
  blas::Diag diag;

  Options(const pika::program_options::variables_map& vm)
      : MiniappOptions(vm), m(vm["m"].as<SizeType>()), n(vm["n"].as<SizeType>()),
        mb(vm["mb"].as<SizeType>()), nb(vm["nb"].as<SizeType>()),
        side(dlaf::miniapp::parseSide(vm["side"].as<std::string>())),
        uplo(dlaf::miniapp::parseUplo(vm["uplo"].as<std::string>())),
        op(dlaf::miniapp::parseOp(vm["op"].as<std::string>())),
        diag(dlaf::miniapp::parseDiag(vm["diag"].as<std::string>())) {
    DLAF_ASSERT(m > 0 && n > 0, m, n);
    DLAF_ASSERT(mb > 0 && nb > 0, mb, nb);

    // Result checking is not implemented yet: downgrade any check request to
    // None with a warning instead of failing later.
    if (do_check != dlaf::miniapp::CheckIterFreq::None) {
      std::cerr << "Warning! At the moment result checking is not implemented." << std::endl;
      do_check = dlaf::miniapp::CheckIterFreq::None;
    }
  }

  Options(Options&&) = default;
  Options(const Options&) = default;
  Options& operator=(Options&&) = default;
  Options& operator=(const Options&) = default;
};

// Element generator: maps a global element index to the value stored there.
template <typename T>
using matrix_values_t = std::function<T(const GlobalElementIndex&)>;

// Generators for the three matrices of a linear system A, B, X for which
// op(A) X = alpha B holds.
template <typename T>
using linear_system_t =
std::tuple<matrix_values_t<T>, matrix_values_t<T>, matrix_values_t<T>>; // A, B, X

// Defined at the end of this file; builds the sample data for Side::Left.
template <typename T>
linear_system_t<T> sampleLeftTr(blas::Uplo uplo, blas::Op op, blas::Diag diag, T alpha, SizeType m);
}  // namespace

struct triangularMultiplicationMiniapp {
template <dlaf::Backend backend, typename T>
static void run(const Options& opts) {
Communicator world(MPI_COMM_WORLD);
CommunicatorGrid comm_grid(world, opts.grid_rows, opts.grid_cols, Ordering::ColumnMajor);

// Allocate memory for the matrices
dlaf::matrix::Matrix<T, Device::CPU> ah(GlobalElementSize{opts.m, opts.m},
TileElementSize{opts.mb, opts.mb}, comm_grid);
dlaf::matrix::Matrix<T, Device::CPU> bh(GlobalElementSize{opts.m, opts.n},
TileElementSize{opts.mb, opts.nb}, comm_grid);

dlaf::matrix::MatrixMirror<T, DefaultDevice_v<backend>, Device::CPU> a(ah);
dlaf::matrix::MatrixMirror<T, DefaultDevice_v<backend>, Device::CPU> b(bh);

auto sync_barrier = [&]() {
a.get().waitLocalTiles();
b.get().waitLocalTiles();
DLAF_MPI_CHECK_ERROR(MPI_Barrier(world));
};

const auto side = opts.side;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to assert the value of this parameter, given that we have sampleLeftTr?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it is needed here and in the triangular solver miniapp as well. Currently the matrix a size is (m x m), so Side::Right fails unless m == n.

DLAF_ASSERT(side == blas::Side::Left, side);

const auto uplo = opts.uplo;
const auto op = opts.op;
const auto diag = opts.diag;
const T alpha = 2.0;

double m = ah.size().rows();
double n = bh.size().cols();
auto add_mul = n * m * m / 2;
const double total_ops = dlaf::total_ops<T>(add_mul, add_mul);

auto [in_op_a, out_b, in_b] = ::sampleLeftTr(uplo, op, diag, alpha, ah.size().rows());

for (int64_t run_index = -opts.nwarmups; run_index < opts.nruns; ++run_index) {
if (0 == world.rank() && run_index >= 0)
std::cout << "[" << run_index << "]" << std::endl;

// setup matrix A and b
using dlaf::matrix::util::set;
set(ah, in_op_a, op);
set(bh, in_b);
a.copySourceToTarget();
b.copySourceToTarget();

sync_barrier();

dlaf::common::Timer<> timeit;
if (opts.local)
dlaf::triangular_multiplication<backend, dlaf::DefaultDevice_v<backend>, T>(side, uplo, op, diag,
alpha, a.get(),
b.get());
else
dlaf::triangular_multiplication<backend, dlaf::DefaultDevice_v<backend>, T>(
comm_grid, side, uplo, op, diag, alpha, a.get(), b.get());

sync_barrier();

// benchmark results
if (0 == world.rank() && run_index >= 0) {
auto elapsed_time = timeit.elapsed();
double gigaflops = total_ops / elapsed_time / 1e9;

std::cout << "[" << run_index << "]"
<< " " << elapsed_time << "s"
<< " " << gigaflops << "GFlop/s"
<< " " << dlaf::internal::FormatShort{opts.type}
<< dlaf::internal::FormatShort{opts.side} << dlaf::internal::FormatShort{opts.uplo}
<< dlaf::internal::FormatShort{opts.op} << dlaf::internal::FormatShort{opts.diag}
<< " " << bh.size() << " " << bh.blockSize() << " " << comm_grid.size() << " "
<< pika::get_os_thread_count() << " " << backend << std::endl;
}

b.copyTargetToSource();

// (optional) run test
if ((opts.do_check == dlaf::miniapp::CheckIterFreq::Last && run_index == (opts.nruns - 1)) ||
opts.do_check == dlaf::miniapp::CheckIterFreq::All) {
DLAF_UNIMPLEMENTED("Check");
}
}
}
};

// pika entry point: sets up DLAF (scoped, torn down in reverse order of
// construction), parses the miniapp options and dispatches over backend/type.
int pika_main(pika::program_options::variables_map& vm) {
// NOTE: pika_finalizer is declared first so pika finalization happens last.
pika::scoped_finalize pika_finalizer;
dlaf::ScopedInitializer init(vm);

const Options opts(vm);
dlaf::miniapp::dispatchMiniapp<triangularMultiplicationMiniapp>(opts);

return EXIT_SUCCESS;
}

// Program entry point: initializes MPI, declares the command-line options and
// hands control to pika, which invokes pika_main.
int main(int argc, char** argv) {
dlaf::comm::mpi_init mpi_initter(argc, argv);

// options
using namespace pika::program_options;
options_description desc_commandline(
"Benchmark computation of A . B, "
"where A is a non-unit lower triangular matrix, and B is an m by n matrix\n\n"
"options\n"
"Usage: miniapp_triangular_multiplication [options]");
desc_commandline.add(dlaf::miniapp::getMiniappOptionsDescription());
desc_commandline.add(dlaf::getOptionsDescription());

// Problem-geometry options specific to this miniapp.
// clang-format off
desc_commandline.add_options()
("m", value<SizeType>() ->default_value(4096), "Matrix b rows")
("n", value<SizeType>() ->default_value(512), "Matrix b columns")
("mb", value<SizeType>() ->default_value(256), "Matrix b block rows")
("nb", value<SizeType>() ->default_value(512), "Matrix b block columns")
;
// clang-format on
// BLAS trmm parameter options (side/uplo/op/diag), shared across miniapps.
dlaf::miniapp::addSideOption(desc_commandline);
dlaf::miniapp::addUploOption(desc_commandline);
dlaf::miniapp::addOpOption(desc_commandline);
dlaf::miniapp::addDiagOption(desc_commandline);

pika::init_params p;
p.desc_cmdline = desc_commandline;
p.rp_callback = dlaf::initResourcePartitionerHandler;
return pika::init(pika_main, argc, argv, p);
}

namespace {
/// Returns a tuple of element generators of three matrices A(m x m), B (m x n), X (m x n), for which it
/// holds op(A) X = alpha B (alpha can be any value).
rasolca marked this conversation as resolved.
Show resolved Hide resolved
///
/// The elements of op(A) (@p el_op_a) are chosen such that:
/// op(A)_ik = (i+1) / (k+.5) * exp(I*(2*i-k)) for the referenced elements
/// op(A)_ik = -9.9 otherwise,
/// where I = 0 for real types or I is the complex unit for complex types.
///
/// The elements of X (@p el_x) are computed as
/// X_kj = (k+.5) / (j+2) * exp(I*(k+j)).
/// These data are typically used to check whether the result of the equation
/// performed with any algorithm is consistent with the computed values.
///
/// Finally, the elements of B (@p el_b) should be:
/// B_ij = (Sum_k op(A)_ik * X_kj) / alpha
/// = (op(A)_ii * X_ij + (kk-1) * gamma) / alpha,
/// where gamma = (i+1) / (j+2) * exp(I*(2*i+j)),
/// kk = i+1 if op(a) is an lower triangular matrix, or
/// kk = m-i if op(a) is an lower triangular matrix.
/// Therefore
/// B_ij = (X_ij + (kk-1) * gamma) / alpha, if diag == Unit
/// B_ij = kk * gamma / alpha, otherwise.
template <typename T>
linear_system_t<T> sampleLeftTr(blas::Uplo uplo, blas::Op op, blas::Diag diag, T alpha, SizeType m) {
Comment on lines +247 to +248
Copy link
Collaborator

@rasolca rasolca Nov 28, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note to reviewers: This is code duplication, but it will disappear in favor of random matrices to be able to:

  • re-enable tests
  • allow complex types

static_assert(std::is_arithmetic_v<T> && !std::is_integral_v<T>,
"it is valid just with floating point values");

bool op_a_lower = (uplo == blas::Uplo::Lower && op == blas::Op::NoTrans) ||
(uplo == blas::Uplo::Upper && op != blas::Op::NoTrans);

auto el_op_a = [op_a_lower, diag](const GlobalElementIndex& index) -> T {
if ((op_a_lower && index.row() < index.col()) || (!op_a_lower && index.row() > index.col()) ||
(diag == blas::Diag::Unit && index.row() == index.col()))
return static_cast<T>(-9.9);

const T i = index.row();
const T k = index.col();

return (i + static_cast<T>(1)) / (k + static_cast<T>(.5));
};

auto el_x = [](const GlobalElementIndex& index) -> T {
const T k = index.row();
const T j = index.col();

return (k + static_cast<T>(.5)) / (j + static_cast<T>(2));
};

auto el_b = [m, alpha, diag, op_a_lower, el_x](const GlobalElementIndex& index) -> T {
const dlaf::BaseType<T> kk = op_a_lower ? index.row() + 1 : m - index.row();

const T i = index.row();
const T j = index.col();
const T gamma = (i + 1) / (j + 2);
if (diag == blas::Diag::Unit)
return ((kk - 1) * gamma + el_x(index)) / alpha;
else
return kk * gamma / alpha;
};

return {el_op_a, el_b, el_x};
}

}
1 change: 1 addition & 0 deletions miniapp/miniapp_triangular_solver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ struct triangularSolverMiniapp {
};

const auto side = opts.side;
DLAF_ASSERT(side == blas::Side::Left, side);
const auto uplo = opts.uplo;
const auto op = opts.op;
const auto diag = opts.diag;
Expand Down
35 changes: 35 additions & 0 deletions scripts/miniapps.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,41 @@ def trsm(
return cmd, env.strip()


def trmm(
    system,
    lib,
    build_dir,
    nodes,
    rpn,
    m_sz,
    n_sz,
    mb_sz,
    nruns,
    suffix="na",
    extra_flags="",
    env="",
    dtype="d",
):
    """Build the command line and environment to run the triangular
    multiplication (trmm) miniapp benchmark.

    If n_sz is None it defaults to m_sz (square B). Output is appended to
    trmm_<lib>_<suffix>.out. Returns a (cmd, env) tuple of stripped strings.
    Raises ValueError for unsupported libraries (only dlaf* is implemented).
    """
    if n_sz is None:
        n_sz = m_sz

    _check_ranks_per_node(system, lib, rpn)
    [total_ranks, cores_per_rank, threads_per_rank] = _computeResourcesNeededList(system, nodes, rpn)
    # Choose the most square process grid for the total rank count.
    gr, gc = _sq_factor(total_ranks)

    if lib.startswith("dlaf"):
        _check_type(dtype)
        env += " OMP_NUM_THREADS=1"
        app = f"{build_dir}/miniapp/miniapp_triangular_multiplication"
        opts = f"--type {dtype} --m {m_sz} --n {n_sz} --mb {mb_sz} --nb {mb_sz} --grid-rows {gr} --grid-cols {gc} --nruns {nruns} {extra_flags}"
    else:
        raise ValueError(_err_msg(lib))

    _checkAppExec(app)
    cmd = f"{app} {opts}".strip() + f" >> trmm_{lib}_{suffix}.out 2>&1"
    return cmd, env.strip()


# lib: allowed libraries are dlaf|slate
# rpn: ranks per node
#
Expand Down
20 changes: 20 additions & 0 deletions scripts/plot_trmm_strong.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env python3
# coding: utf-8

#
# Distributed Linear Algebra with Future (DLAF)
#
# Copyright (c) 2018-2023, ETH Zurich
# All rights reserved.
#
# Please, refer to the LICENSE file in the root directory.
# SPDX-License-Identifier: BSD-3-Clause
#

import postprocess as pp

# Parse the benchmark job output files passed on the command line.
df = pp.parse_jobs_cmdargs(description="Plot trmm strong scaling benchmarks.")

# Compute trmm metrics and emit strong-scaling plots, both one plot per block
# size and a combined plot with all block sizes.
df_grp = pp.calc_trmm_metrics(df)
pp.gen_trmm_plots_strong(df_grp)
pp.gen_trmm_plots_strong(df_grp, combine_mb=True)
20 changes: 20 additions & 0 deletions scripts/plot_trmm_weak.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env python3
# coding: utf-8

#
# Distributed Linear Algebra with Future (DLAF)
#
# Copyright (c) 2018-2023, ETH Zurich
# All rights reserved.
#
# Please, refer to the LICENSE file in the root directory.
# SPDX-License-Identifier: BSD-3-Clause
#

import postprocess as pp

# Parse the benchmark job output files passed on the command line.
df = pp.parse_jobs_cmdargs(description="Plot trmm weak scaling benchmarks.")

# Compute trmm metrics and emit weak-scaling plots (log-scaled x axis), both
# one plot per block size and a combined plot with all block sizes.
df_grp = pp.calc_trmm_metrics(df)
pp.gen_trmm_plots_weak(df_grp, 1024, logx=True)
pp.gen_trmm_plots_weak(df_grp, 1024, logx=True, combine_mb=True)
Loading
Loading