diff --git a/miniapp/CMakeLists.txt b/miniapp/CMakeLists.txt index 9d5b054928..8f3c145f15 100644 --- a/miniapp/CMakeLists.txt +++ b/miniapp/CMakeLists.txt @@ -39,6 +39,8 @@ DLAF_addMiniapp(miniapp_bt_reduction_to_band SOURCES miniapp_bt_reduction_to_ban DLAF_addMiniapp(miniapp_triangular_solver SOURCES miniapp_triangular_solver.cpp) +DLAF_addMiniapp(miniapp_triangular_multiplication SOURCES miniapp_triangular_multiplication.cpp) + DLAF_addMiniapp(miniapp_eigensolver SOURCES miniapp_eigensolver.cpp) DLAF_addMiniapp(miniapp_gen_eigensolver SOURCES miniapp_gen_eigensolver.cpp) diff --git a/miniapp/miniapp_triangular_multiplication.cpp b/miniapp/miniapp_triangular_multiplication.cpp new file mode 100644 index 0000000000..4cd0e52692 --- /dev/null +++ b/miniapp/miniapp_triangular_multiplication.cpp @@ -0,0 +1,288 @@ +// +// Distributed Linear Algebra with Future (DLAF) +// +// Copyright (c) 2018-2023, ETH Zurich +// All rights reserved. +// +// Please, refer to the LICENSE file in the root directory. +// SPDX-License-Identifier: BSD-3-Clause +// + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +using dlaf::Backend; +using dlaf::DefaultDevice_v; +using dlaf::Device; +using dlaf::GlobalElementIndex; +using dlaf::GlobalElementSize; +using dlaf::SizeType; +using dlaf::TileElementSize; +using dlaf::comm::Communicator; +using dlaf::comm::CommunicatorGrid; +using dlaf::common::Ordering; + +struct Options + : dlaf::miniapp::MiniappOptions { + SizeType m; + SizeType n; + SizeType mb; + SizeType nb; + blas::Side side; + blas::Uplo uplo; + blas::Op op; + blas::Diag diag; + + Options(const pika::program_options::variables_map& vm) + : MiniappOptions(vm), m(vm["m"].as()), n(vm["n"].as()), + mb(vm["mb"].as()), nb(vm["nb"].as()), + side(dlaf::miniapp::parseSide(vm["side"].as())), + uplo(dlaf::miniapp::parseUplo(vm["uplo"].as())), + op(dlaf::miniapp::parseOp(vm["op"].as())), + diag(dlaf::miniapp::parseDiag(vm["diag"].as())) { + DLAF_ASSERT(m > 0 && n > 0, m, n); + DLAF_ASSERT(mb > 0 && nb > 0, mb, nb); + + if (do_check != dlaf::miniapp::CheckIterFreq::None) { + std::cerr << "Warning! At the moment result checking it is not implemented." << std::endl; + do_check = dlaf::miniapp::CheckIterFreq::None; + } + } + + Options(Options&&) = default; + Options(const Options&) = default; + Options& operator=(Options&&) = default; + Options& operator=(const Options&) = default; +}; + +template +using matrix_values_t = std::function; + +template +using linear_system_t = + std::tuple, matrix_values_t, matrix_values_t>; // A, B, X + +template +linear_system_t sampleLeftTr(blas::Uplo uplo, blas::Op op, blas::Diag diag, T alpha, SizeType m); +} + +struct triangularMultiplicationMiniapp { + template + static void run(const Options& opts) { + Communicator world(MPI_COMM_WORLD); + CommunicatorGrid comm_grid(world, opts.grid_rows, opts.grid_cols, Ordering::ColumnMajor); + + // Allocate memory for the matrices + dlaf::matrix::Matrix ah(GlobalElementSize{opts.m, opts.m}, + TileElementSize{opts.mb, opts.mb}, comm_grid); + dlaf::matrix::Matrix bh(GlobalElementSize{opts.m, opts.n}, + TileElementSize{opts.mb, opts.nb}, comm_grid); + + dlaf::matrix::MatrixMirror, Device::CPU> a(ah); + dlaf::matrix::MatrixMirror, Device::CPU> b(bh); + + auto sync_barrier = [&]() { + a.get().waitLocalTiles(); + b.get().waitLocalTiles(); + DLAF_MPI_CHECK_ERROR(MPI_Barrier(world)); + }; + + const auto side = opts.side; + DLAF_ASSERT(side == blas::Side::Left, side); + + const auto uplo = opts.uplo; + const auto op = opts.op; + const auto diag = opts.diag; + const T alpha = 2.0; + + double m = ah.size().rows(); + double n = bh.size().cols(); + auto add_mul = n * m * m / 2; + const double total_ops = dlaf::total_ops(add_mul, add_mul); + + auto [in_op_a, out_b, in_b] = ::sampleLeftTr(uplo, op, diag, alpha, ah.size().rows()); + + for (int64_t run_index = -opts.nwarmups; run_index < opts.nruns; ++run_index) { + if (0 == world.rank() && run_index >= 0) + std::cout << "[" << run_index << "]" << std::endl; + + // setup matrix A and b + using dlaf::matrix::util::set; + set(ah, in_op_a, op); + set(bh, in_b); + a.copySourceToTarget(); + b.copySourceToTarget(); + + sync_barrier(); + + dlaf::common::Timer<> timeit; + if (opts.local) + dlaf::triangular_multiplication, T>(side, uplo, op, diag, + alpha, a.get(), + b.get()); + else + dlaf::triangular_multiplication, T>( + comm_grid, side, uplo, op, diag, alpha, a.get(), b.get()); + + sync_barrier(); + + // benchmark results + if (0 == world.rank() && run_index >= 0) { + auto elapsed_time = timeit.elapsed(); + double gigaflops = total_ops / elapsed_time / 1e9; + + std::cout << "[" << run_index << "]" + << " " << elapsed_time << "s" + << " " << gigaflops << "GFlop/s" + << " " << dlaf::internal::FormatShort{opts.type} + << dlaf::internal::FormatShort{opts.side} << dlaf::internal::FormatShort{opts.uplo} + << dlaf::internal::FormatShort{opts.op} << dlaf::internal::FormatShort{opts.diag} + << " " << bh.size() << " " << bh.blockSize() << " " << comm_grid.size() << " " + << pika::get_os_thread_count() << " " << backend << std::endl; + } + + b.copyTargetToSource(); + + // (optional) run test + if ((opts.do_check == dlaf::miniapp::CheckIterFreq::Last && run_index == (opts.nruns - 1)) || + opts.do_check == dlaf::miniapp::CheckIterFreq::All) { + DLAF_UNIMPLEMENTED("Check"); + } + } + } +}; + +int pika_main(pika::program_options::variables_map& vm) { + pika::scoped_finalize pika_finalizer; + dlaf::ScopedInitializer init(vm); + + const Options opts(vm); + dlaf::miniapp::dispatchMiniapp(opts); + + return EXIT_SUCCESS; +} + +int main(int argc, char** argv) { + dlaf::comm::mpi_init mpi_initter(argc, argv); + + // options + using namespace pika::program_options; + options_description desc_commandline( + "Benchmark computation of A . B, " + "where A is a non-unit lower triangular matrix, and B is an m by n matrix\n\n" + "options\n" + "Usage: miniapp_triangular_multiplication [options]"); + desc_commandline.add(dlaf::miniapp::getMiniappOptionsDescription()); + desc_commandline.add(dlaf::getOptionsDescription()); + + // clang-format off + desc_commandline.add_options() + ("m", value() ->default_value(4096), "Matrix b rows") + ("n", value() ->default_value(512), "Matrix b columns") + ("mb", value() ->default_value(256), "Matrix b block rows") + ("nb", value() ->default_value(512), "Matrix b block columns") + ; + // clang-format on + dlaf::miniapp::addSideOption(desc_commandline); + dlaf::miniapp::addUploOption(desc_commandline); + dlaf::miniapp::addOpOption(desc_commandline); + dlaf::miniapp::addDiagOption(desc_commandline); + + pika::init_params p; + p.desc_cmdline = desc_commandline; + p.rp_callback = dlaf::initResourcePartitionerHandler; + return pika::init(pika_main, argc, argv, p); +} + +namespace { +/// Returns a tuple of element generators of three matrices A(m x m), B (m x n), X (m x n), for which it +/// holds op(A) X = alpha B (alpha can be any value). +/// +/// The elements of op(A) (@p el_op_a) are chosen such that: +/// op(A)_ik = (i+1) / (k+.5) * exp(I*(2*i-k)) for the referenced elements +/// op(A)_ik = -9.9 otherwise, +/// where I = 0 for real types or I is the complex unit for complex types. +/// +/// The elements of X (@p el_x) are computed as +/// X_kj = (k+.5) / (j+2) * exp(I*(k+j)). +/// These data are typically used to check whether the result of the equation +/// performed with any algorithm is consistent with the computed values. +/// +/// Finally, the elements of B (@p el_b) should be: +/// B_ij = (Sum_k op(A)_ik * X_kj) / alpha +/// = (op(A)_ii * X_ij + (kk-1) * gamma) / alpha, +/// where gamma = (i+1) / (j+2) * exp(I*(2*i+j)), +/// kk = i+1 if op(a) is an lower triangular matrix, or +/// kk = m-i if op(a) is an lower triangular matrix. +/// Therefore +/// B_ij = (X_ij + (kk-1) * gamma) / alpha, if diag == Unit +/// B_ij = kk * gamma / alpha, otherwise. +template +linear_system_t sampleLeftTr(blas::Uplo uplo, blas::Op op, blas::Diag diag, T alpha, SizeType m) { + static_assert(std::is_arithmetic_v && !std::is_integral_v, + "it is valid just with floating point values"); + + bool op_a_lower = (uplo == blas::Uplo::Lower && op == blas::Op::NoTrans) || + (uplo == blas::Uplo::Upper && op != blas::Op::NoTrans); + + auto el_op_a = [op_a_lower, diag](const GlobalElementIndex& index) -> T { + if ((op_a_lower && index.row() < index.col()) || (!op_a_lower && index.row() > index.col()) || + (diag == blas::Diag::Unit && index.row() == index.col())) + return static_cast(-9.9); + + const T i = index.row(); + const T k = index.col(); + + return (i + static_cast(1)) / (k + static_cast(.5)); + }; + + auto el_x = [](const GlobalElementIndex& index) -> T { + const T k = index.row(); + const T j = index.col(); + + return (k + static_cast(.5)) / (j + static_cast(2)); + }; + + auto el_b = [m, alpha, diag, op_a_lower, el_x](const GlobalElementIndex& index) -> T { + const dlaf::BaseType kk = op_a_lower ? index.row() + 1 : m - index.row(); + + const T i = index.row(); + const T j = index.col(); + const T gamma = (i + 1) / (j + 2); + if (diag == blas::Diag::Unit) + return ((kk - 1) * gamma + el_x(index)) / alpha; + else + return kk * gamma / alpha; + }; + + return {el_op_a, el_b, el_x}; +} + +} diff --git a/miniapp/miniapp_triangular_solver.cpp b/miniapp/miniapp_triangular_solver.cpp index c5427dd748..9d3478e9ed 100644 --- a/miniapp/miniapp_triangular_solver.cpp +++ b/miniapp/miniapp_triangular_solver.cpp @@ -115,6 +115,7 @@ struct triangularSolverMiniapp { }; const auto side = opts.side; + DLAF_ASSERT(side == blas::Side::Left, side); const auto uplo = opts.uplo; const auto op = opts.op; const auto diag = opts.diag; diff --git a/scripts/miniapps.py b/scripts/miniapps.py index 8706eee9a1..cdc05a397f 100644 --- a/scripts/miniapps.py +++ b/scripts/miniapps.py @@ -278,6 +278,41 @@ def trsm( return cmd, env.strip() +def trmm( + system, + lib, + build_dir, + nodes, + rpn, + m_sz, + n_sz, + mb_sz, + nruns, + suffix="na", + extra_flags="", + env="", + dtype="d", +): + if n_sz == None: + n_sz = m_sz + + _check_ranks_per_node(system, lib, rpn) + [total_ranks, cores_per_rank, threads_per_rank] = _computeResourcesNeededList(system, nodes, rpn) + gr, gc = _sq_factor(total_ranks) + + if lib.startswith("dlaf"): + _check_type(dtype) + env += " OMP_NUM_THREADS=1" + app = f"{build_dir}/miniapp/miniapp_triangular_multiplication" + opts = f"--type {dtype} --m {m_sz} --n {n_sz} --mb {mb_sz} --nb {mb_sz} --grid-rows {gr} --grid-cols {gc} --nruns {nruns} {extra_flags}" + else: + raise ValueError(_err_msg(lib)) + + _checkAppExec(app) + cmd = f"{app} {opts}".strip() + f" >> trmm_{lib}_{suffix}.out 2>&1" + return cmd, env.strip() + + # lib: allowed libraries are dlaf|slate # rpn: ranks per node # diff --git a/scripts/plot_trmm_strong.py b/scripts/plot_trmm_strong.py new file mode 100755 index 0000000000..8df58b05a7 --- /dev/null +++ b/scripts/plot_trmm_strong.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +# +# Distributed Linear Algebra with Future (DLAF) +# +# Copyright (c) 2018-2023, ETH Zurich +# All rights reserved. +# +# Please, refer to the LICENSE file in the root directory. +# SPDX-License-Identifier: BSD-3-Clause +# + +import postprocess as pp + +df = pp.parse_jobs_cmdargs(description="Plot trmm strong scaling benchmarks.") + +df_grp = pp.calc_trmm_metrics(df) +pp.gen_trmm_plots_strong(df_grp) +pp.gen_trmm_plots_strong(df_grp, combine_mb=True) diff --git a/scripts/plot_trmm_weak.py b/scripts/plot_trmm_weak.py new file mode 100755 index 0000000000..0d44009e66 --- /dev/null +++ b/scripts/plot_trmm_weak.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +# +# Distributed Linear Algebra with Future (DLAF) +# +# Copyright (c) 2018-2023, ETH Zurich +# All rights reserved. +# +# Please, refer to the LICENSE file in the root directory. +# SPDX-License-Identifier: BSD-3-Clause +# + +import postprocess as pp + +df = pp.parse_jobs_cmdargs(description="Plot trmm weak scaling benchmarks.") + +df_grp = pp.calc_trmm_metrics(df) +pp.gen_trmm_plots_weak(df_grp, 1024, logx=True) +pp.gen_trmm_plots_weak(df_grp, 1024, logx=True, combine_mb=True) diff --git a/scripts/postprocess.py b/scripts/postprocess.py index 50111fe50e..339aee945f 100644 --- a/scripts/postprocess.py +++ b/scripts/postprocess.py @@ -47,7 +47,7 @@ def _gen_nodes_plot( """ Args: plt_type: ppn | time - plt_routine: chol | hegst | red2band | band2trid | trid_evp | bt_band2trid | bt_red2band | trsm | evp | gevp It is used to filter data. + plt_routine: chol | hegst | red2band | band2trid | trid_evp | bt_band2trid | bt_red2band | trsm | trmm | evp | gevp It is used to filter data. title: title of the plot df: the pandas.DataFrame with the data for the plot combine_mb: bool indicates if different mb has to be included in the same plot @@ -238,7 +238,7 @@ def _parse_line_based(fout, bench_name, nodes): # fail. alg_name = bench_name[0 : bench_name.find("_dlaf")] - if alg_name in ["chol", "hegst", "trsm"]: + if alg_name in ["chol", "hegst", "trsm", "trmm"]: pstr_res = "[{run_index:d}] {time:g}s {perf:g}GFlop/s{matrix_type:optional_text} ({matrix_rows:d}, {matrix_cols:d}) ({block_rows:d}, {block_cols:d}) ({grid_rows:d}, {grid_cols:d}) {:d}{backend:optional_text}" if alg_name in ["red2band", "band2trid", "bt_band2trid", "bt_red2band"]: pstr_res = "[{run_index:d}] {time:g}s {perf:g}GFlop/s{matrix_type:optional_text} ({matrix_rows:d}, {matrix_cols:d}) ({block_rows:d}, {block_cols:d}) {band:d} ({grid_rows:d}, {grid_cols:d}) {:d}{backend:optional_text}" @@ -419,6 +419,10 @@ def calc_trsm_metrics(df): return _calc_metrics(["matrix_rows", "matrix_cols", "block_rows", "nodes", "bench_name"], df) +def calc_trmm_metrics(df): + return _calc_metrics(["matrix_rows", "matrix_cols", "block_rows", "nodes", "bench_name"], df) + + def calc_gen2std_metrics(df): return _calc_metrics(["matrix_rows", "block_rows", "nodes", "bench_name"], df) @@ -483,7 +487,7 @@ def _gen_plot( Args: scaling strong | weak name: name of the routine to be included in the title - routine: chol | hegst | red2band | band2trid | trid_evp | bt_band2trid | trsm | evp | gevp + routine: chol | hegst | red2band | band2trid | trid_evp | bt_band2trid | trsm | trmm | evp | gevp combine_mb: bool indicates if different mb has to be included in the same plot size_type: m | mn It indicates which sizes are relevant. customize_ppn: function accepting the two arguments fig and ax for ppn plot customization @@ -748,6 +752,74 @@ def gen_trsm_plots_weak( ) +def gen_trmm_plots_strong( + df, + logx=False, + combine_mb=False, + filename_suffix=None, + customize_ppn=add_basic_legend, + customize_time=add_basic_legend, + **proxy_args, +): + """ + Args: + customize_ppn: function accepting the two arguments fig and ax for ppn plot customization + customize_time: function accepting the two arguments fig and ax for time plot customization + Default customization (ppn and time): add_basic_legend. They can be set to "None" to remove the legend. + """ + _gen_plot( + scaling="strong", + name="TRMM", + routine="trmm", + filename="trmm", + size_type="mn", + df=df, + logx=logx, + combine_mb=combine_mb, + filename_suffix=filename_suffix, + ppn_plot=True, + customize_ppn=customize_ppn, + time_plot=True, + customize_time=customize_time, + **proxy_args, + ) + + +def gen_trmm_plots_weak( + df, + weak_rt_approx, + logx=False, + combine_mb=False, + filename_suffix=None, + customize_ppn=add_basic_legend, + customize_time=add_basic_legend, + **proxy_args, +): + """ + Args: + customize_ppn: function accepting the two arguments fig and ax for ppn plot customization + customize_time: function accepting the two arguments fig and ax for time plot customization + Default customization (ppn and time): add_basic_legend. They can be set to "None" to remove the legend. + """ + _gen_plot( + scaling="weak", + name="TRMM", + routine="trmm", + filename="trmm", + size_type="mn", + df=df, + logx=logx, + combine_mb=combine_mb, + filename_suffix=filename_suffix, + ppn_plot=True, + customize_ppn=customize_ppn, + time_plot=True, + customize_time=customize_time, + weak_rt_approx=weak_rt_approx, + **proxy_args, + ) + + def gen_gen2std_plots_strong( df, logx=False,