Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor RetiledMatrix into Matrix #935

Merged
merged 5 commits into from
Jul 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions include/dlaf/auxiliary/norm.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,17 @@ namespace dlaf::auxiliary {
///
/// @pre `A.blockSize().rows() == A.blockSize().cols()`,
/// @pre @p A is distributed according to @p grid,
/// @pre @p A has equal tile and block sizes,
/// @return the norm @p norm_type of the Matrix @p A or 0 if `A.size().isEmpty()` (see LAPACK doc for
/// additional info).
template <Backend backend, Device device, class T>
dlaf::BaseType<T> norm(comm::CommunicatorGrid grid, comm::Index2D rank, lapack::Norm norm_type,
blas::Uplo uplo, Matrix<const T, device>& A) {
using dlaf::matrix::equal_process_grid;
using dlaf::matrix::single_tile_per_block;

DLAF_ASSERT(equal_process_grid(A, grid), A, grid);
DLAF_ASSERT(single_tile_per_block(A), A);

// LAPACK documentation specify that if any dimension is 0, the result is 0
if (A.size().isEmpty())
Expand Down
8 changes: 6 additions & 2 deletions include/dlaf/eigensolver/band_to_tridiag.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,16 @@ namespace eigensolver {
/// @pre mat_a has a square size,
/// @pre mat_a has a square block size,
/// @pre band_size is a divisor of mat_a.blockSize().cols(), and band_size >= 2
/// @pre mat_a is not distributed.
/// @pre mat_a is not distributed,
/// @pre mat_a has equal tile and block sizes.
template <Backend B, Device D, class T>
TridiagResult<T, Device::CPU> bandToTridiag(blas::Uplo uplo, SizeType band_size,
Matrix<const T, D>& mat_a) {
DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
DLAF_ASSERT(mat_a.blockSize().rows() % band_size == 0, mat_a.blockSize().rows(), band_size);
DLAF_ASSERT(matrix::local_matrix(mat_a), mat_a);
DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
DLAF_ASSERT(band_size >= 2, band_size);

switch (uplo) {
Expand Down Expand Up @@ -140,13 +142,15 @@ TridiagResult<T, Device::CPU> bandToTridiag(blas::Uplo uplo, SizeType band_size,
/// @pre mat_a has a square size,
/// @pre mat_a has a square block size,
/// @pre band_size is a divisor of mat_a.blockSize().cols() and band_size >= 2,
/// @pre mat_a is distributed according to grid.
/// @pre mat_a is distributed according to grid,
/// @pre mat_a has equal tile and block sizes.
template <Backend backend, Device device, class T>
TridiagResult<T, Device::CPU> bandToTridiag(comm::CommunicatorGrid grid, blas::Uplo uplo,
SizeType band_size, Matrix<const T, device>& mat_a) {
DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
DLAF_ASSERT(matrix::equal_process_grid(mat_a, grid), mat_a, grid);
DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
DLAF_ASSERT(band_size >= 2, band_size);

// If the grid contains only one rank force local implementation.
Expand Down
5 changes: 5 additions & 0 deletions include/dlaf/eigensolver/bt_band_to_tridiag.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ namespace dlaf::eigensolver {
// @pre band_size is a divisor of mat_hh.blockSize().cols()
// @pre mat_e is not distributed
// @pre mat_hh is not distributed
// @pre mat_e has equal tile and block sizes
// @pre mat_hh has equal tile and block sizes
template <Backend B, Device D, class T>
void backTransformationBandToTridiag(const SizeType band_size, matrix::Matrix<T, D>& mat_e,
matrix::Matrix<const T, Device::CPU>& mat_hh) {
Expand All @@ -63,6 +65,9 @@ void backTransformationBandToTridiag(const SizeType band_size, matrix::Matrix<T,
DLAF_ASSERT(mat_hh.size().rows() == mat_e.size().rows(), mat_hh, mat_e);
DLAF_ASSERT(mat_hh.blockSize().rows() == mat_e.blockSize().rows(), mat_hh, mat_e);

DLAF_ASSERT(matrix::single_tile_per_block(mat_e), mat_e);
DLAF_ASSERT(matrix::single_tile_per_block(mat_hh), mat_hh);

DLAF_ASSERT(band_size >= 2, band_size);
DLAF_ASSERT(mat_hh.blockSize().rows() % band_size == 0, mat_hh.blockSize(), band_size);

Expand Down
5 changes: 2 additions & 3 deletions include/dlaf/eigensolver/bt_band_to_tridiag/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
#include <dlaf/matrix/index.h>
#include <dlaf/matrix/matrix.h>
#include <dlaf/matrix/panel.h>
#include <dlaf/matrix/retiled_matrix.h>
#include <dlaf/matrix/tile.h>
#include <dlaf/sender/policy.h>
#include <dlaf/sender/traits.h>
Expand Down Expand Up @@ -621,7 +620,7 @@ void BackTransformationT2B<B, D, T>::call(const SizeType band_size, Matrix<T, D>
const SizeType nsweeps = nrSweeps<T>(mat_hh.size().cols());

const LocalTileSize tiles_per_block(mat_e.blockSize().rows() / b, 1);
matrix::RetiledMatrix<T, D> mat_e_rt(mat_e, tiles_per_block);
Matrix<T, D> mat_e_rt = mat_e.retiledSubPipeline(tiles_per_block);

const auto& dist_hh = mat_hh.distribution();
const auto& dist_e_rt = mat_e_rt.distribution();
Expand Down Expand Up @@ -746,7 +745,7 @@ void BackTransformationT2B<B, D, T>::call(comm::CommunicatorGrid grid, const Siz
const SizeType group_size = getTuneParameters().bt_band_to_tridiag_hh_apply_group_size;

const LocalTileSize tiles_per_block(mat_e.blockSize().rows() / b, 1);
matrix::RetiledMatrix<T, D> mat_e_rt(mat_e, tiles_per_block);
Matrix<T, D> mat_e_rt = mat_e.retiledSubPipeline(tiles_per_block);

const auto& dist_hh = mat_hh.distribution();
const auto& dist_e_rt = mat_e_rt.distribution();
Expand Down
12 changes: 10 additions & 2 deletions include/dlaf/eigensolver/bt_reduction_to_band.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ namespace eigensolver {
/// @param mat_taus is the tau vector as returned by reductionToBand. The j-th element is the scaling
/// factor for the j-th HH tranformation.
/// @pre mat_c is not distributed,
/// @pre mat_v is not distributed.
/// @pre mat_v is not distributed,
/// @pre mat_c has equal tile and block sizes,
/// @pre mat_v has equal tile and block sizes.
template <Backend backend, Device device, class T>
void backTransformationReductionToBand(const SizeType b, Matrix<T, device>& mat_c,
Matrix<const T, device>& mat_v,
Expand All @@ -45,6 +47,8 @@ void backTransformationReductionToBand(const SizeType b, Matrix<T, device>& mat_
DLAF_ASSERT(square_blocksize(mat_v), mat_v);
DLAF_ASSERT(mat_c.size().rows() == mat_v.size().rows(), mat_c, mat_v);
DLAF_ASSERT(mat_c.blockSize().rows() == mat_v.blockSize().rows(), mat_c, mat_v);
DLAF_ASSERT(single_tile_per_block(mat_c), mat_c);
DLAF_ASSERT(single_tile_per_block(mat_v), mat_v);

[[maybe_unused]] auto nr_reflectors_blocks = [&b, &mat_v]() {
const SizeType m = mat_v.size().rows();
Expand All @@ -68,7 +72,9 @@ void backTransformationReductionToBand(const SizeType b, Matrix<T, device>& mat_
/// @param mat_taus is the tau vector as returned by reductionToBand. The j-th element is the scaling
/// factor for the j-th HH tranformation.
/// @pre mat_c is distributed,
/// @pre mat_v is distributed according to grid.
/// @pre mat_v is distributed according to grid,
/// @pre mat_c has equal tile and block sizes,
/// @pre mat_v has equal tile and block sizes.
template <Backend backend, Device device, class T>
void backTransformationReductionToBand(comm::CommunicatorGrid grid, const SizeType b,
Matrix<T, device>& mat_c, Matrix<const T, device>& mat_v,
Expand All @@ -79,6 +85,8 @@ void backTransformationReductionToBand(comm::CommunicatorGrid grid, const SizeTy
DLAF_ASSERT(square_blocksize(mat_v), mat_v);
DLAF_ASSERT(mat_c.size().rows() == mat_v.size().rows(), mat_c, mat_v);
DLAF_ASSERT(mat_c.blockSize().rows() == mat_v.blockSize().rows(), mat_c, mat_v);
DLAF_ASSERT(single_tile_per_block(mat_c), mat_c);
DLAF_ASSERT(single_tile_per_block(mat_v), mat_v);

[[maybe_unused]] auto nr_reflectors_blocks = [&b, &mat_v]() {
const SizeType m = mat_v.size().rows();
Expand Down
6 changes: 6 additions & 0 deletions include/dlaf/eigensolver/eigensolver.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ void eigensolver(blas::Uplo uplo, Matrix<T, D>& mat, Matrix<BaseType<T>, D>& eig
DLAF_ASSERT(square_blocksize(eigenvectors), eigenvectors);
DLAF_ASSERT(eigenvectors.size() == mat.size(), eigenvectors, mat);
DLAF_ASSERT(eigenvectors.blockSize() == mat.blockSize(), eigenvectors, mat);
DLAF_ASSERT(single_tile_per_block(mat), mat);
DLAF_ASSERT(single_tile_per_block(eigenvalues), eigenvalues);
DLAF_ASSERT(single_tile_per_block(eigenvectors), eigenvectors);

internal::Eigensolver<B, D, T>::call(uplo, mat, eigenvalues, eigenvectors);
}
Expand Down Expand Up @@ -107,6 +110,9 @@ void eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix<T, D>& mat
DLAF_ASSERT(square_blocksize(eigenvectors), eigenvectors);
DLAF_ASSERT(eigenvectors.size() == mat.size(), eigenvectors, mat);
DLAF_ASSERT(eigenvectors.blockSize() == mat.blockSize(), eigenvectors, mat);
DLAF_ASSERT(single_tile_per_block(mat), mat);
DLAF_ASSERT(single_tile_per_block(eigenvalues), eigenvalues);
DLAF_ASSERT(single_tile_per_block(eigenvectors), eigenvectors);

internal::Eigensolver<B, D, T>::call(grid, uplo, mat, eigenvalues, eigenvectors);
}
Expand Down
8 changes: 8 additions & 0 deletions include/dlaf/eigensolver/gen_eigensolver.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ void genEigensolver(blas::Uplo uplo, Matrix<T, D>& mat_a, Matrix<T, D>& mat_b,
eigenvectors);
DLAF_ASSERT(eigenvectors.size() == mat_a.size(), eigenvectors, mat_a);
DLAF_ASSERT(eigenvectors.blockSize() == mat_a.blockSize(), eigenvectors, mat_a);
DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
DLAF_ASSERT(matrix::single_tile_per_block(mat_b), mat_b);
DLAF_ASSERT(matrix::single_tile_per_block(eigenvalues), eigenvalues);
DLAF_ASSERT(matrix::single_tile_per_block(eigenvectors), eigenvectors);

internal::GenEigensolver<B, D, T>::call(uplo, mat_a, mat_b, eigenvalues, eigenvectors);
}
Expand Down Expand Up @@ -139,6 +143,10 @@ void genEigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix<T, D>&
eigenvectors);
DLAF_ASSERT(eigenvectors.size() == mat_a.size(), eigenvectors, mat_a);
DLAF_ASSERT(eigenvectors.blockSize() == mat_a.blockSize(), eigenvectors, mat_a);
DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
DLAF_ASSERT(matrix::single_tile_per_block(mat_b), mat_b);
DLAF_ASSERT(matrix::single_tile_per_block(eigenvalues), eigenvalues);
DLAF_ASSERT(matrix::single_tile_per_block(eigenvectors), eigenvectors);

internal::GenEigensolver<B, D, T>::call(grid, uplo, mat_a, mat_b, eigenvalues, eigenvectors);
}
Expand Down
6 changes: 6 additions & 0 deletions include/dlaf/eigensolver/gen_to_std.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ namespace eigensolver {
/// Note: B should be modifiable as the diagonal tiles might be temporarly modified during the calculation.
/// @pre mat_a and mat_b have the same square size,
/// @pre mat_a and mat_b have the same square block size,
/// @pre mat_a and mat_b have the same tile and block sizes,
/// @pre mat_a and mat_b are not distributed.
template <Backend backend, Device device, class T>
void genToStd(blas::Uplo uplo, Matrix<T, device>& mat_a, Matrix<T, device>& mat_b) {
Expand All @@ -47,6 +48,8 @@ void genToStd(blas::Uplo uplo, Matrix<T, device>& mat_a, Matrix<T, device>& mat_
DLAF_ASSERT(matrix::square_blocksize(mat_b), mat_b);
DLAF_ASSERT(mat_a.size() == mat_b.size(), mat_a, mat_b);
DLAF_ASSERT(mat_a.blockSize() == mat_b.blockSize(), mat_a, mat_b);
DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
DLAF_ASSERT(matrix::single_tile_per_block(mat_b), mat_b);
DLAF_ASSERT(matrix::local_matrix(mat_a), mat_a);
DLAF_ASSERT(matrix::local_matrix(mat_b), mat_b);

Expand Down Expand Up @@ -80,6 +83,7 @@ void genToStd(blas::Uplo uplo, Matrix<T, device>& mat_a, Matrix<T, device>& mat_
/// Note: B should be modifiable as the diagonal tiles might be temporarly modified during the calculation.
/// @pre mat_a and mat_b have the same square size,
/// @pre mat_a and mat_b have the same square block size,
/// @pre mat_a and mat_b have the same tile and block sizes,
/// @pre mat_a and mat_b are distributed according to the grid.
template <Backend backend, Device device, class T>
void genToStd(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix<T, device>& mat_a,
Expand All @@ -90,6 +94,8 @@ void genToStd(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix<T, device>& m
DLAF_ASSERT(matrix::square_blocksize(mat_b), mat_b);
DLAF_ASSERT(mat_a.size() == mat_b.size(), mat_a, mat_b);
DLAF_ASSERT(mat_a.blockSize() == mat_b.blockSize(), mat_a, mat_b);
DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
DLAF_ASSERT(matrix::single_tile_per_block(mat_b), mat_b);
DLAF_ASSERT(matrix::equal_process_grid(mat_a, grid), mat_a, grid);
DLAF_ASSERT(matrix::equal_process_grid(mat_b, grid), mat_b, grid);

Expand Down
4 changes: 4 additions & 0 deletions include/dlaf/eigensolver/reduction_to_band.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,14 @@ namespace dlaf::eigensolver {
///
/// @pre mat_a has a square size
/// @pre mat_a has a square block size
/// @pre mat_a has equal tile and block sizes
/// @pre mat_a is a local matrix
/// @pre mat_a.blockSize().rows() % band_size == 0
template <Backend B, Device D, class T>
Matrix<T, Device::CPU> reductionToBand(Matrix<T, D>& mat_a, const SizeType band_size) {
DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);

DLAF_ASSERT(matrix::local_matrix(mat_a), mat_a);

Expand Down Expand Up @@ -97,13 +99,15 @@ v v v v * *
///
/// @pre mat_a has a square size
/// @pre mat_a has a square block size
/// @pre mat_a has equal tile and block sizes
/// @pre mat_a is distributed according to @p grid
/// @pre mat_a.blockSize().rows() % band_size == 0
template <Backend B, Device D, class T>
Matrix<T, Device::CPU> reductionToBand(comm::CommunicatorGrid grid, Matrix<T, D>& mat_a,
const SizeType band_size) {
DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
DLAF_ASSERT(matrix::equal_process_grid(mat_a, grid), mat_a, grid);

DLAF_ASSERT(band_size >= 2, band_size);
Expand Down
21 changes: 10 additions & 11 deletions include/dlaf/eigensolver/reduction_to_band/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
#include <dlaf/matrix/index.h>
#include <dlaf/matrix/matrix.h>
#include <dlaf/matrix/panel.h>
#include <dlaf/matrix/retiled_matrix.h>
#include <dlaf/matrix/tile.h>
#include <dlaf/matrix/views.h>
#include <dlaf/schedulers.h>
Expand Down Expand Up @@ -829,15 +828,15 @@ template <class T>
struct ComputePanelHelper<Backend::MC, Device::CPU, T> {
ComputePanelHelper(const std::size_t, matrix::Distribution) {}

void call(Matrix<T, Device::CPU>& mat_a, matrix::RetiledMatrix<T, Device::CPU>& mat_taus,
const SizeType j_sub, const matrix::SubPanelView& panel_view) {
void call(Matrix<T, Device::CPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
const matrix::SubPanelView& panel_view) {
using red2band::local::computePanelReflectors;
computePanelReflectors(mat_a, mat_taus, j_sub, panel_view);
}

template <Device D, class CommSender, class TriggerSender>
void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
Matrix<T, D>& mat_a, matrix::RetiledMatrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
const matrix::SubPanelView& panel_view) {
using red2band::distributed::computePanelReflectors;
computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
Expand All @@ -852,8 +851,8 @@ struct ComputePanelHelper<Backend::GPU, Device::GPU, T> {
ComputePanelHelper(const std::size_t n_workspaces, matrix::Distribution dist_a)
: panels_v(n_workspaces, dist_a) {}

void call(Matrix<T, Device::GPU>& mat_a, matrix::RetiledMatrix<T, Device::CPU>& mat_taus,
const SizeType j_sub, const matrix::SubPanelView& panel_view) {
void call(Matrix<T, Device::GPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
const matrix::SubPanelView& panel_view) {
using red2band::local::computePanelReflectors;

namespace ex = pika::execution::experimental;
Expand All @@ -872,7 +871,7 @@ struct ComputePanelHelper<Backend::GPU, Device::GPU, T> {

template <Device D, class CommSender, class TriggerSender>
void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
Matrix<T, D>& mat_a, matrix::RetiledMatrix<T, Device::CPU>& mat_taus, SizeType j_sub,
Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, SizeType j_sub,
const matrix::SubPanelView& panel_view) {
auto& v = panels_v.nextResource();

Expand Down Expand Up @@ -962,8 +961,8 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(Matrix<T, D>& mat_a, const
if (nrefls == 0)
return mat_taus;

matrix::RetiledMatrix<T, Device::CPU> mat_taus_retiled(
mat_taus, LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
Matrix<T, Device::CPU> mat_taus_retiled =
mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));

const SizeType ntiles = (nrefls - 1) / band_size + 1;
DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
Expand Down Expand Up @@ -1113,8 +1112,8 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid gri
if (nrefls == 0)
return mat_taus;

matrix::RetiledMatrix<T, Device::CPU> mat_taus_retiled(
mat_taus, LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
Matrix<T, Device::CPU> mat_taus_retiled =
mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));

const SizeType ntiles = (nrefls - 1) / band_size + 1;
DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
Expand Down
Loading
Loading