Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Instrument distributed eigensolver with HDF5 #1016

Merged
merged 32 commits into from
May 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
45bfb8f
fix conflict
RMeli Mar 25, 2024
ea67395
enable multiple calls
RMeli Nov 8, 2023
8b370d2
fix conflicts
RMeli Mar 25, 2024
5f1e2db
fixes
RMeli Dec 1, 2023
35960e1
formatting
RMeli Dec 1, 2023
5c7220d
fix conflict
RMeli Mar 25, 2024
e8c9e85
fix name
RMeli Mar 25, 2024
fd438ff
fix ref
RMeli Mar 25, 2024
2c62631
Update include/dlaf/eigensolver/band_to_tridiag/mc.h
RMeli Apr 8, 2024
709c5b0
add printout when nrefls==0
RMeli Apr 11, 2024
b4bdd4b
fix
RMeli Apr 11, 2024
295d826
Merge branch 'hdf5' of https://github.com/RMeli/DLA-Future into hdf5
RMeli Apr 11, 2024
822d1ac
fix conflict
RMeli Mar 25, 2024
4a2f301
enable multiple calls
RMeli Nov 8, 2023
4ba3e3e
fix conflicts
RMeli Mar 25, 2024
338e51e
fixes
RMeli Dec 1, 2023
2c265fa
formatting
RMeli Dec 1, 2023
3785c2e
fix conflict
RMeli Mar 25, 2024
01114f2
fix name
RMeli Mar 25, 2024
4865bc1
fix ref
RMeli Mar 25, 2024
73bbe3d
add printout when nrefls==0
RMeli Apr 11, 2024
a46c984
fix
RMeli Apr 11, 2024
84bbca7
Update include/dlaf/eigensolver/band_to_tridiag/mc.h
RMeli Apr 8, 2024
ecb8fed
Merge branch 'hdf5' of https://github.com/RMeli/DLA-Future into hdf5
RMeli Apr 23, 2024
5d90c9c
remove old comments
RMeli Apr 23, 2024
57f47dc
add type in file name and make counters atomic
RMeli Apr 23, 2024
112b615
update
RMeli Apr 25, 2024
9eb3b27
format
RMeli Apr 25, 2024
ca00c87
add missing changes, remove unrelated changes
RMeli Apr 25, 2024
f3fe0f1
add back string
RMeli Apr 25, 2024
315f95a
Apply suggestions from code review
RMeli Apr 26, 2024
68ab441
Merge branch 'master' into hdf5
rasolca May 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

cmake_minimum_required(VERSION 3.22)

project(DLAF VERSION 0.4.0)
project(DLAF VERSION 0.1.0)

# ---------------------------------------------------------------------------
# CMake configurations
Expand Down
2 changes: 1 addition & 1 deletion ci/common-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ stages:
reports:
dotenv: build.env
variables:
SPACK_SHA: 4b0479159feed0d9bcd6f1d75b166a6ac67f9602
SPACK_SHA: dcc4423a9d0219a26ceeb52e329ed956e41f4d2c
SPACK_DLAF_REPO: ./spack
DOCKER_BUILD_ARGS: '[
"BASE_IMAGE",
Expand Down
25 changes: 25 additions & 0 deletions include/dlaf/eigensolver/band_to_tridiag/mc.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@

#pragma once

#include <atomic>
#include <sstream>

#include <pika/execution.hpp>
#include <pika/semaphore.hpp>

Expand All @@ -29,6 +32,7 @@
#include <dlaf/lapack/gpu/laset.h>
#include <dlaf/lapack/tile.h>
#include <dlaf/matrix/copy_tile.h>
#include <dlaf/matrix/hdf5.h>
#include <dlaf/matrix/matrix.h>
#include <dlaf/matrix/tile.h>
#include <dlaf/memory/memory_view.h>
Expand Down Expand Up @@ -1034,6 +1038,19 @@ TridiagResult<T, Device::CPU> BandToTridiag<Backend::MC, D, T>::call_L(
// Should be dispatched to local implementation if (1x1) grid.
DLAF_ASSERT(grid.size() != comm::Size2D(1, 1), grid);

#ifdef DLAF_WITH_HDF5
static std::atomic<size_t> num_b2t_calls = 0;
std::stringstream fname;
fname << "band_to_tridiag-" << matrix::internal::TypeToString_v<T> << "-"
<< std::to_string(num_b2t_calls) << ".h5";
std::optional<matrix::internal::FileHDF5> file;

if (getTuneParameters().debug_dump_band_to_tridiagonal_data) {
file = matrix::internal::FileHDF5(grid.fullCommunicator(), fname.str());
file->write(mat_a, "/input");
}
#endif

// note: A is square and has square blocksize
SizeType size = mat_a.size().cols();
SizeType n = mat_a.nrTiles().cols();
Expand Down Expand Up @@ -1524,6 +1541,14 @@ TridiagResult<T, Device::CPU> BandToTridiag<Backend::MC, D, T>::call_L(
mat_trid.readwrite(index)));
}

#ifdef DLAF_WITH_HDF5
if (getTuneParameters().debug_dump_band_to_tridiagonal_data) {
file->write(mat_trid, "/tridiagonal");
}

num_b2t_calls++;
#endif

return {std::move(mat_trid), std::move(mat_v)};
}
}
29 changes: 19 additions & 10 deletions include/dlaf/eigensolver/eigensolver/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@
//
#pragma once

#include <atomic>
#include <cmath>
#include <optional>
#include <sstream>
#include <vector>

#include <dlaf/blas/tile.h>
Expand Down Expand Up @@ -60,28 +62,35 @@ void Eigensolver<B, D, T>::call(comm::CommunicatorGrid& grid, blas::Uplo uplo, M
if (uplo != blas::Uplo::Lower)
DLAF_UNIMPLEMENTED(uplo);

auto mat_taus = reduction_to_band<B>(grid, mat_a, band_size);
auto ret = band_to_tridiagonal<Backend::MC>(grid, uplo, band_size, mat_a);

#ifdef DLAF_WITH_HDF5
static std::atomic<size_t> num_eigensolver_calls = 0;
std::stringstream fname;
fname << "eigensolver-" << matrix::internal::TypeToString_v<T> << "-"
<< std::to_string(num_eigensolver_calls) << ".h5";
std::optional<matrix::internal::FileHDF5> file;

if (getTuneParameters().debug_dump_trisolver_data) {
file = matrix::internal::FileHDF5(grid.fullCommunicator(), "trid-ref.h5");
file->write(ret.tridiagonal, "/tridiag");
if (getTuneParameters().debug_dump_eigensolver_data) {
file = matrix::internal::FileHDF5(grid.fullCommunicator(), fname.str());
file->write(mat_a, "/input");
}
#endif

auto mat_taus = reduction_to_band<B>(grid, mat_a, band_size);

auto ret = band_to_tridiagonal<Backend::MC>(grid, uplo, band_size, mat_a);

tridiagonal_eigensolver<B>(grid, ret.tridiagonal, evals, mat_e);

bt_band_to_tridiagonal<B>(grid, band_size, mat_e, ret.hh_reflectors);
bt_reduction_to_band<B>(grid, band_size, mat_e, mat_a, mat_taus);

#ifdef DLAF_WITH_HDF5
if (getTuneParameters().debug_dump_trisolver_data) {
if (getTuneParameters().debug_dump_eigensolver_data) {
file->write(evals, "/evals");
file->write(mat_e, "/evecs");
}
#endif

bt_band_to_tridiagonal<B>(grid, band_size, mat_e, ret.hh_reflectors);
bt_reduction_to_band<B>(grid, band_size, mat_e, mat_a, mat_taus);
num_eigensolver_calls++;
#endif
}
}
35 changes: 34 additions & 1 deletion include/dlaf/eigensolver/reduction_to_band/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@
//
#pragma once

#include <atomic>
#include <cmath>
#include <cstddef>
#include <sstream>
#include <vector>

#include <pika/barrier.hpp>
Expand Down Expand Up @@ -41,6 +43,7 @@
#include <dlaf/lapack/tile.h>
#include <dlaf/matrix/copy_tile.h>
#include <dlaf/matrix/distribution.h>
#include <dlaf/matrix/hdf5.h>
#include <dlaf/matrix/index.h>
#include <dlaf/matrix/matrix.h>
#include <dlaf/matrix/panel.h>
Expand Down Expand Up @@ -1129,6 +1132,19 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
auto mpi_col_chain = grid.col_communicator_pipeline();
auto mpi_col_chain_panel = grid.col_communicator_pipeline();

#ifdef DLAF_WITH_HDF5
static std::atomic<size_t> num_reduction_to_band_calls = 0;
std::stringstream fname;
fname << "reduction_to_band-" << matrix::internal::TypeToString_v<T> << "-"
<< std::to_string(num_reduction_to_band_calls) << ".h5";
std::optional<matrix::internal::FileHDF5> file;

if (getTuneParameters().debug_dump_reduction_to_band_data) {
file = matrix::internal::FileHDF5(grid.fullCommunicator(), fname.str());
file->write(mat_a, "/input");
}
#endif

const auto& dist = mat_a.distribution();
const comm::Index2D rank = dist.rankIndex();

Expand All @@ -1144,8 +1160,17 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
comm::Index2D(mat_a.rankIndex().col(), 0),
comm::Index2D(mat_a.sourceRankIndex().col(), 0)));

if (nrefls == 0)
if (nrefls == 0) {
#ifdef DLAF_WITH_HDF5
if (getTuneParameters().debug_dump_reduction_to_band_data) {
file->write(mat_a, "/band");
}

num_reduction_to_band_calls++;
#endif

return mat_taus;
}

Matrix<T, Device::CPU> mat_taus_retiled =
mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
Expand Down Expand Up @@ -1423,6 +1448,14 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
v.reset();
}

#ifdef DLAF_WITH_HDF5
if (getTuneParameters().debug_dump_reduction_to_band_data) {
file->write(mat_a, "/band");
}

num_reduction_to_band_calls++;
#endif

return mat_taus;
}
}
25 changes: 25 additions & 0 deletions include/dlaf/eigensolver/tridiag_solver/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#pragma once

#include <algorithm>
#include <atomic>
#include <sstream>

#include <pika/execution.hpp>
#include <pika/thread.hpp>
Expand All @@ -27,6 +29,7 @@
#include <dlaf/eigensolver/tridiag_solver/merge.h>
#include <dlaf/lapack/tile.h>
#include <dlaf/matrix/copy_tile.h>
#include <dlaf/matrix/hdf5.h>
#include <dlaf/permutations/general.h>
#include <dlaf/permutations/general/impl.h>
#include <dlaf/sender/make_sender_algorithm_overloads.h>
Expand Down Expand Up @@ -363,6 +366,19 @@ void TridiagSolver<B, D, T>::call(comm::CommunicatorGrid& grid, Matrix<T, Device
if (evecs.size().isEmpty())
return;

#ifdef DLAF_WITH_HDF5
static std::atomic<size_t> num_tridiag_solver_calls = 0;
std::stringstream fname;
fname << "tridiag_solver-"
<< matrix::internal::TypeToString_v<T> << std::to_string(num_tridiag_solver_calls) << ".h5";
std::optional<matrix::internal::FileHDF5> file;

if (getTuneParameters().debug_dump_tridiag_solver_data) {
file = matrix::internal::FileHDF5(grid.fullCommunicator(), fname.str());
file->write(tridiag, "/input");
}
#endif

// If the matrix is composed by a single tile simply call stedc.
if (evecs.nrTiles().linear_size() == 1) {
if constexpr (D == Device::CPU) {
Expand Down Expand Up @@ -441,6 +457,15 @@ void TridiagSolver<B, D, T>::call(comm::CommunicatorGrid& grid, Matrix<T, Device
dlaf::permutations::permute<Backend::MC, Device::CPU, T, Coord::Col>(row_task_chain, 0, n, ws_h.i1,
ws_hm.e0, ws_hm.e2);
copy(ws_hm.e2, evecs);

#ifdef DLAF_WITH_HDF5
if (getTuneParameters().debug_dump_tridiag_solver_data) {
file->write(evecs, "/evecs");
file->write(evals, "/evals");
}

num_tridiag_solver_calls++;
#endif
}

// \overload TridiagSolver<B, D, T>::call()
Expand Down
31 changes: 31 additions & 0 deletions include/dlaf/matrix/hdf5.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#include <complex>
#include <cstdint>
#include <string>
#include <string_view>
RMeli marked this conversation as resolved.
Show resolved Hide resolved
#include <typeinfo>

#include <H5Cpp.h>
#include <mpi.h>
Expand Down Expand Up @@ -53,6 +55,35 @@ const H5::PredType& hdf5_datatype<std::complex<T>>::type = hdf5_datatype<T>::typ
template <class T>
struct hdf5_datatype<const T> : public hdf5_datatype<T> {};

// Compile-time mapping from an element type to a short textual tag.
//
// The explicit specializations below give one-letter tags for the four supported
// element types: float -> "s", double -> "d", std::complex<float> -> "c" and
// std::complex<double> -> "z". Any other type falls back to the implementation-defined
// string returned by typeid(T).name().
// NOTE(review): std::type_info::name() is not constexpr, so instantiating the fallback
// is expected to be rejected by the compiler; only the specializations appear usable.
// Confirm whether that is intended.
template <class T>
struct TypeToString {
  static constexpr std::string_view value{typeid(T).name()};
};

// Single precision real.
template <>
struct TypeToString<float> {
  static constexpr std::string_view value{"s"};
};

// Double precision real.
template <>
struct TypeToString<double> {
  static constexpr std::string_view value{"d"};
};

// Single precision complex.
template <>
struct TypeToString<std::complex<float>> {
  static constexpr std::string_view value{"c"};
};

// Double precision complex.
template <>
struct TypeToString<std::complex<double>> {
  static constexpr std::string_view value{"z"};
};

// Shorthand for TypeToString<T>::value.
template <class T>
inline constexpr std::string_view TypeToString_v = TypeToString<T>::value;

// Helper function that for each local tile index in @p dist, gets a sender of a tile with
// @p get_tile and sends it to a function that takes care of the mapping between file and memory.
// Then, this function, passes all required arguments to @p dataset_op which should be either
Expand Down
25 changes: 20 additions & 5 deletions include/dlaf/tune.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,23 @@ namespace dlaf {
/// DLA-Future tuning parameters.
///
/// Holds the value of the parameters that can be used to tune DLA-Future.
/// - debug_dump_trisolver_data:
/// Enable dump of trisolver input/output data to "trid-ref.h5" file that will be created in the
/// - debug_dump_eigensolver_data:
/// Enable dump of eigensolver input/output data to "eigensolver.h5" file that will be created in the
/// working folder (it should not exist before the execution).
/// WARNING: just a single execution can be dumped on disk, and any subsequent call fails.
/// Set with environment variable DLAF_DEBUG_DUMP_TRISOLVER_DATA.
/// Set with environment variable DLAF_DEBUG_DUMP_EIGENSOLVER_DATA.
/// - debug_dump_reduction_to_band_data:
/// Enable dump of reduction_to_band input/output data to "reduction_to_band.h5" file that will be
/// created in the working folder (it should not exist before the execution).
/// Set with environment variable DLAF_DEBUG_DUMP_REDUCTION_TO_BAND_DATA.
/// - debug_dump_band_to_tridiagonal_data:
/// Enable dump of band_to_tridiagonal input/output data to "band_to_tridiagonal.h5" file that will
/// be created in the working folder (it should not exist before the execution).
/// Set with environment variable DLAF_DEBUG_DUMP_BAND_TO_TRIDIAGONAL_DATA.
/// - debug_dump_tridiag_solver_data:
/// Enable dump of tridiagonal solver input/output data to "tridiagonal.h5" file that will be
/// created in the working folder (it should not exist before the execution).
/// Set with environment variable DLAF_DEBUG_DUMP_TRIDIAG_SOLVER_DATA.
/// - red2band_panel_nworkers:
/// The maximum number of threads to use for computing the panel in the reduction to band algorithm.
/// Set with --dlaf:red2band-panel-nworkers or env variable DLAF_RED2BAND_PANEL_NWORKERS.
Expand Down Expand Up @@ -81,7 +93,10 @@ struct TuneParameters {
red2band_panel_nworkers = std::max<std::size_t>(1, default_pool_thread_count / 2);
tridiag_rank1_nworkers = default_pool_thread_count;
}
bool debug_dump_trisolver_data = false;
bool debug_dump_eigensolver_data = false;
bool debug_dump_reduction_to_band_data = false;
bool debug_dump_band_to_tridiagonal_data = false;
bool debug_dump_tridiag_solver_data = false;
std::size_t red2band_panel_nworkers = 1;
std::size_t red2band_barrier_busy_wait_us = 1000;
std::size_t tridiag_rank1_nworkers = 1;
Expand Down
5 changes: 0 additions & 5 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -353,11 +353,6 @@ configure_package_config_file(
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CMAKE_PROJECT_NAME}
)

write_basic_package_version_file(
DLAFConfigVersion.cmake VERSION ${PACKAGE_VERSION} COMPATIBILITY AnyNewerVersion
)

install(FILES ${CMAKE_CURRENT_BINARY_DIR}/DLAFConfig.cmake
${CMAKE_CURRENT_BINARY_DIR}/DLAFConfigVersion.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CMAKE_PROJECT_NAME}
)
8 changes: 7 additions & 1 deletion src/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,13 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu
updateConfigurationValue(vm, param.band_to_tridiag_1d_block_size_base,
"BAND_TO_TRIDIAG_1D_BLOCK_SIZE_BASE", "band-to-tridiag-1d-block-size-base");

updateConfigurationValue(vm, param.debug_dump_trisolver_data, "DEBUG_DUMP_TRISOLVER_DATA", "");
updateConfigurationValue(vm, param.debug_dump_eigensolver_data, "DEBUG_DUMP_EIGENSOLVER_DATA", "");
updateConfigurationValue(vm, param.debug_dump_reduction_to_band_data,
"DEBUG_DUMP_REDUCTION_TO_BAND_DATA", "");
updateConfigurationValue(vm, param.debug_dump_band_to_tridiagonal_data,
"DEBUG_DUMP_BAND_TO_TRIDIAGONAL_DATA", "");
updateConfigurationValue(vm, param.debug_dump_tridiag_solver_data, "DEBUG_DUMP_TRIDIAG_SOLVER_DATA",
"");

updateConfigurationValue(vm, param.tridiag_rank1_nworkers, "TRIDIAG_RANK1_NWORKERS",
"tridiag-rank1-nworkers");
Expand Down
Loading