eth-cscs · rasolca · Jul 19, 2023 · Jul 12, 2023 · Jul 12, 2023 · Jul 13, 2023
diff --git a/include/dlaf/auxiliary/norm.h b/include/dlaf/auxiliary/norm.h
@@ -37,14 +37,17 @@ namespace dlaf::auxiliary {
 ///
 /// @pre `A.blockSize().rows() == A.blockSize().cols()`,
 /// @pre @p A is distributed according to @p grid,
+/// @pre @p A has equal tile and block sizes,
 /// @return the norm @p norm_type of the Matrix @p A or 0 if `A.size().isEmpty()` (see LAPACK doc for
 /// additional info).
 template <Backend backend, Device device, class T>
 dlaf::BaseType<T> norm(comm::CommunicatorGrid grid, comm::Index2D rank, lapack::Norm norm_type,
                        blas::Uplo uplo, Matrix<const T, device>& A) {
   using dlaf::matrix::equal_process_grid;
+  using dlaf::matrix::single_tile_per_block;
 
   DLAF_ASSERT(equal_process_grid(A, grid), A, grid);
+  DLAF_ASSERT(single_tile_per_block(A), A);
 
   // LAPACK documentation specify that if any dimension is 0, the result is 0
   if (A.size().isEmpty())

diff --git a/include/dlaf/eigensolver/band_to_tridiag.h b/include/dlaf/eigensolver/band_to_tridiag.h
@@ -69,14 +69,16 @@ namespace eigensolver {
 /// @pre mat_a has a square size,
 /// @pre mat_a has a square block size,
 /// @pre band_size is a divisor of mat_a.blockSize().cols(), and band_size >= 2
-/// @pre mat_a is not distributed.
+/// @pre mat_a is not distributed,
+/// @pre mat_a has equal tile and block sizes.
 template <Backend B, Device D, class T>
 TridiagResult<T, Device::CPU> bandToTridiag(blas::Uplo uplo, SizeType band_size,
                                             Matrix<const T, D>& mat_a) {
   DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
   DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
   DLAF_ASSERT(mat_a.blockSize().rows() % band_size == 0, mat_a.blockSize().rows(), band_size);
   DLAF_ASSERT(matrix::local_matrix(mat_a), mat_a);
+  DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
   DLAF_ASSERT(band_size >= 2, band_size);
 
   switch (uplo) {
@@ -140,13 +142,15 @@ TridiagResult<T, Device::CPU> bandToTridiag(blas::Uplo uplo, SizeType band_size,
 /// @pre mat_a has a square size,
 /// @pre mat_a has a square block size,
 /// @pre band_size is a divisor of mat_a.blockSize().cols() and band_size >= 2,
-/// @pre mat_a is distributed according to grid.
+/// @pre mat_a is distributed according to grid,
+/// @pre mat_a has equal tile and block sizes.
 template <Backend backend, Device device, class T>
 TridiagResult<T, Device::CPU> bandToTridiag(comm::CommunicatorGrid grid, blas::Uplo uplo,
                                             SizeType band_size, Matrix<const T, device>& mat_a) {
   DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
   DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
   DLAF_ASSERT(matrix::equal_process_grid(mat_a, grid), mat_a, grid);
+  DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
   DLAF_ASSERT(band_size >= 2, band_size);
 
   // If the grid contains only one rank force local implementation.

diff --git a/include/dlaf/eigensolver/bt_band_to_tridiag.h b/include/dlaf/eigensolver/bt_band_to_tridiag.h
@@ -51,6 +51,8 @@ namespace dlaf::eigensolver {
 // @pre band_size is a divisor of mat_hh.blockSize().cols()
 // @pre mat_e is not distributed
 // @pre mat_hh is not distributed
+// @pre mat_e has equal tile and block sizes
+// @pre mat_hh has equal tile and block sizes
 template <Backend B, Device D, class T>
 void backTransformationBandToTridiag(const SizeType band_size, matrix::Matrix<T, D>& mat_e,
                                      matrix::Matrix<const T, Device::CPU>& mat_hh) {
@@ -63,6 +65,9 @@ void backTransformationBandToTridiag(const SizeType band_size, matrix::Matrix<T,
   DLAF_ASSERT(mat_hh.size().rows() == mat_e.size().rows(), mat_hh, mat_e);
   DLAF_ASSERT(mat_hh.blockSize().rows() == mat_e.blockSize().rows(), mat_hh, mat_e);
 
+  DLAF_ASSERT(matrix::single_tile_per_block(mat_e), mat_e);
+  DLAF_ASSERT(matrix::single_tile_per_block(mat_hh), mat_hh);
+
   DLAF_ASSERT(band_size >= 2, band_size);
   DLAF_ASSERT(mat_hh.blockSize().rows() % band_size == 0, mat_hh.blockSize(), band_size);
 

diff --git a/include/dlaf/eigensolver/bt_band_to_tridiag/impl.h b/include/dlaf/eigensolver/bt_band_to_tridiag/impl.h
@@ -39,7 +39,6 @@
 #include <dlaf/matrix/index.h>
 #include <dlaf/matrix/matrix.h>
 #include <dlaf/matrix/panel.h>
-#include <dlaf/matrix/retiled_matrix.h>
 #include <dlaf/matrix/tile.h>
 #include <dlaf/sender/policy.h>
 #include <dlaf/sender/traits.h>
@@ -621,7 +620,7 @@ void BackTransformationT2B<B, D, T>::call(const SizeType band_size, Matrix<T, D>
   const SizeType nsweeps = nrSweeps<T>(mat_hh.size().cols());
 
   const LocalTileSize tiles_per_block(mat_e.blockSize().rows() / b, 1);
-  matrix::RetiledMatrix<T, D> mat_e_rt(mat_e, tiles_per_block);
+  Matrix<T, D> mat_e_rt = mat_e.retiledSubPipeline(tiles_per_block);
 
   const auto& dist_hh = mat_hh.distribution();
   const auto& dist_e_rt = mat_e_rt.distribution();
@@ -746,7 +745,7 @@ void BackTransformationT2B<B, D, T>::call(comm::CommunicatorGrid grid, const Siz
   const SizeType group_size = getTuneParameters().bt_band_to_tridiag_hh_apply_group_size;
 
   const LocalTileSize tiles_per_block(mat_e.blockSize().rows() / b, 1);
-  matrix::RetiledMatrix<T, D> mat_e_rt(mat_e, tiles_per_block);
+  Matrix<T, D> mat_e_rt = mat_e.retiledSubPipeline(tiles_per_block);
 
   const auto& dist_hh = mat_hh.distribution();
   const auto& dist_e_rt = mat_e_rt.distribution();

diff --git a/include/dlaf/eigensolver/bt_reduction_to_band.h b/include/dlaf/eigensolver/bt_reduction_to_band.h
@@ -34,7 +34,9 @@ namespace eigensolver {
 /// @param mat_taus is the tau vector as returned by reductionToBand. The j-th element is the scaling
 /// factor for the j-th HH tranformation.
 /// @pre mat_c is not distributed,
-/// @pre mat_v is not distributed.
+/// @pre mat_v is not distributed,
+/// @pre mat_c has equal tile and block sizes,
+/// @pre mat_v has equal tile and block sizes.
 template <Backend backend, Device device, class T>
 void backTransformationReductionToBand(const SizeType b, Matrix<T, device>& mat_c,
                                        Matrix<const T, device>& mat_v,
@@ -45,6 +47,8 @@ void backTransformationReductionToBand(const SizeType b, Matrix<T, device>& mat_
   DLAF_ASSERT(square_blocksize(mat_v), mat_v);
   DLAF_ASSERT(mat_c.size().rows() == mat_v.size().rows(), mat_c, mat_v);
   DLAF_ASSERT(mat_c.blockSize().rows() == mat_v.blockSize().rows(), mat_c, mat_v);
+  DLAF_ASSERT(single_tile_per_block(mat_c), mat_c);
+  DLAF_ASSERT(single_tile_per_block(mat_v), mat_v);
 
   [[maybe_unused]] auto nr_reflectors_blocks = [&b, &mat_v]() {
     const SizeType m = mat_v.size().rows();
@@ -68,7 +72,9 @@ void backTransformationReductionToBand(const SizeType b, Matrix<T, device>& mat_
 /// @param mat_taus is the tau vector as returned by reductionToBand. The j-th element is the scaling
 /// factor for the j-th HH tranformation.
 /// @pre mat_c is distributed,
-/// @pre mat_v is distributed according to grid.
+/// @pre mat_v is distributed according to grid,
+/// @pre mat_c has equal tile and block sizes,
+/// @pre mat_v has equal tile and block sizes.
 template <Backend backend, Device device, class T>
 void backTransformationReductionToBand(comm::CommunicatorGrid grid, const SizeType b,
                                        Matrix<T, device>& mat_c, Matrix<const T, device>& mat_v,
@@ -79,6 +85,8 @@ void backTransformationReductionToBand(comm::CommunicatorGrid grid, const SizeTy
   DLAF_ASSERT(square_blocksize(mat_v), mat_v);
   DLAF_ASSERT(mat_c.size().rows() == mat_v.size().rows(), mat_c, mat_v);
   DLAF_ASSERT(mat_c.blockSize().rows() == mat_v.blockSize().rows(), mat_c, mat_v);
+  DLAF_ASSERT(single_tile_per_block(mat_c), mat_c);
+  DLAF_ASSERT(single_tile_per_block(mat_v), mat_v);
 
   [[maybe_unused]] auto nr_reflectors_blocks = [&b, &mat_v]() {
     const SizeType m = mat_v.size().rows();

diff --git a/include/dlaf/eigensolver/eigensolver.h b/include/dlaf/eigensolver/eigensolver.h
@@ -50,6 +50,9 @@ void eigensolver(blas::Uplo uplo, Matrix<T, D>& mat, Matrix<BaseType<T>, D>& eig
   DLAF_ASSERT(square_blocksize(eigenvectors), eigenvectors);
   DLAF_ASSERT(eigenvectors.size() == mat.size(), eigenvectors, mat);
   DLAF_ASSERT(eigenvectors.blockSize() == mat.blockSize(), eigenvectors, mat);
+  DLAF_ASSERT(single_tile_per_block(mat), mat);
+  DLAF_ASSERT(single_tile_per_block(eigenvalues), eigenvalues);
+  DLAF_ASSERT(single_tile_per_block(eigenvectors), eigenvectors);
 
   internal::Eigensolver<B, D, T>::call(uplo, mat, eigenvalues, eigenvectors);
 }
@@ -107,6 +110,9 @@ void eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix<T, D>& mat
   DLAF_ASSERT(square_blocksize(eigenvectors), eigenvectors);
   DLAF_ASSERT(eigenvectors.size() == mat.size(), eigenvectors, mat);
   DLAF_ASSERT(eigenvectors.blockSize() == mat.blockSize(), eigenvectors, mat);
+  DLAF_ASSERT(single_tile_per_block(mat), mat);
+  DLAF_ASSERT(single_tile_per_block(eigenvalues), eigenvalues);
+  DLAF_ASSERT(single_tile_per_block(eigenvectors), eigenvectors);
 
   internal::Eigensolver<B, D, T>::call(grid, uplo, mat, eigenvalues, eigenvectors);
 }

diff --git a/include/dlaf/eigensolver/gen_eigensolver.h b/include/dlaf/eigensolver/gen_eigensolver.h
@@ -58,6 +58,10 @@ void genEigensolver(blas::Uplo uplo, Matrix<T, D>& mat_a, Matrix<T, D>& mat_b,
               eigenvectors);
   DLAF_ASSERT(eigenvectors.size() == mat_a.size(), eigenvectors, mat_a);
   DLAF_ASSERT(eigenvectors.blockSize() == mat_a.blockSize(), eigenvectors, mat_a);
+  DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
+  DLAF_ASSERT(matrix::single_tile_per_block(mat_b), mat_b);
+  DLAF_ASSERT(matrix::single_tile_per_block(eigenvalues), eigenvalues);
+  DLAF_ASSERT(matrix::single_tile_per_block(eigenvectors), eigenvectors);
 
   internal::GenEigensolver<B, D, T>::call(uplo, mat_a, mat_b, eigenvalues, eigenvectors);
 }
@@ -139,6 +143,10 @@ void genEigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix<T, D>&
               eigenvectors);
   DLAF_ASSERT(eigenvectors.size() == mat_a.size(), eigenvectors, mat_a);
   DLAF_ASSERT(eigenvectors.blockSize() == mat_a.blockSize(), eigenvectors, mat_a);
+  DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
+  DLAF_ASSERT(matrix::single_tile_per_block(mat_b), mat_b);
+  DLAF_ASSERT(matrix::single_tile_per_block(eigenvalues), eigenvalues);
+  DLAF_ASSERT(matrix::single_tile_per_block(eigenvectors), eigenvectors);
 
   internal::GenEigensolver<B, D, T>::call(grid, uplo, mat_a, mat_b, eigenvalues, eigenvectors);
 }

diff --git a/include/dlaf/eigensolver/gen_to_std.h b/include/dlaf/eigensolver/gen_to_std.h
@@ -38,6 +38,7 @@ namespace eigensolver {
 /// Note: B should be modifiable as the diagonal tiles might be temporarly modified during the calculation.
 /// @pre mat_a and mat_b have the same square size,
 /// @pre mat_a and mat_b have the same square block size,
+/// @pre mat_a and mat_b have the same tile and block sizes,
 /// @pre mat_a and mat_b are not distributed.
 template <Backend backend, Device device, class T>
 void genToStd(blas::Uplo uplo, Matrix<T, device>& mat_a, Matrix<T, device>& mat_b) {
@@ -47,6 +48,8 @@ void genToStd(blas::Uplo uplo, Matrix<T, device>& mat_a, Matrix<T, device>& mat_
   DLAF_ASSERT(matrix::square_blocksize(mat_b), mat_b);
   DLAF_ASSERT(mat_a.size() == mat_b.size(), mat_a, mat_b);
   DLAF_ASSERT(mat_a.blockSize() == mat_b.blockSize(), mat_a, mat_b);
+  DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
+  DLAF_ASSERT(matrix::single_tile_per_block(mat_b), mat_b);
   DLAF_ASSERT(matrix::local_matrix(mat_a), mat_a);
   DLAF_ASSERT(matrix::local_matrix(mat_b), mat_b);
 
@@ -80,6 +83,7 @@ void genToStd(blas::Uplo uplo, Matrix<T, device>& mat_a, Matrix<T, device>& mat_
 /// Note: B should be modifiable as the diagonal tiles might be temporarly modified during the calculation.
 /// @pre mat_a and mat_b have the same square size,
 /// @pre mat_a and mat_b have the same square block size,
+/// @pre mat_a and mat_b have the same tile and block sizes,
 /// @pre mat_a and mat_b are distributed according to the grid.
 template <Backend backend, Device device, class T>
 void genToStd(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix<T, device>& mat_a,
@@ -90,6 +94,8 @@ void genToStd(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix<T, device>& m
   DLAF_ASSERT(matrix::square_blocksize(mat_b), mat_b);
   DLAF_ASSERT(mat_a.size() == mat_b.size(), mat_a, mat_b);
   DLAF_ASSERT(mat_a.blockSize() == mat_b.blockSize(), mat_a, mat_b);
+  DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
+  DLAF_ASSERT(matrix::single_tile_per_block(mat_b), mat_b);
   DLAF_ASSERT(matrix::equal_process_grid(mat_a, grid), mat_a, grid);
   DLAF_ASSERT(matrix::equal_process_grid(mat_b, grid), mat_b, grid);
 

diff --git a/include/dlaf/eigensolver/reduction_to_band.h b/include/dlaf/eigensolver/reduction_to_band.h
@@ -32,12 +32,14 @@ namespace dlaf::eigensolver {
 ///
 /// @pre mat_a has a square size
 /// @pre mat_a has a square block size
+/// @pre mat_a has equal tile and block sizes
 /// @pre mat_a is a local matrix
 /// @pre mat_a.blockSize().rows() % band_size == 0
 template <Backend B, Device D, class T>
 Matrix<T, Device::CPU> reductionToBand(Matrix<T, D>& mat_a, const SizeType band_size) {
   DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
   DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
+  DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
 
   DLAF_ASSERT(matrix::local_matrix(mat_a), mat_a);
 
@@ -97,13 +99,15 @@ v v v v * *
 ///
 /// @pre mat_a has a square size
 /// @pre mat_a has a square block size
+/// @pre mat_a has equal tile and block sizes
 /// @pre mat_a is distributed according to @p grid
 /// @pre mat_a.blockSize().rows() % band_size == 0
 template <Backend B, Device D, class T>
 Matrix<T, Device::CPU> reductionToBand(comm::CommunicatorGrid grid, Matrix<T, D>& mat_a,
                                        const SizeType band_size) {
   DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
   DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
+  DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
   DLAF_ASSERT(matrix::equal_process_grid(mat_a, grid), mat_a, grid);
 
   DLAF_ASSERT(band_size >= 2, band_size);

diff --git a/include/dlaf/eigensolver/reduction_to_band/impl.h b/include/dlaf/eigensolver/reduction_to_band/impl.h
@@ -42,7 +42,6 @@
 #include <dlaf/matrix/index.h>
 #include <dlaf/matrix/matrix.h>
 #include <dlaf/matrix/panel.h>
-#include <dlaf/matrix/retiled_matrix.h>
 #include <dlaf/matrix/tile.h>
 #include <dlaf/matrix/views.h>
 #include <dlaf/schedulers.h>
@@ -829,15 +828,15 @@ template <class T>
 struct ComputePanelHelper<Backend::MC, Device::CPU, T> {
   ComputePanelHelper(const std::size_t, matrix::Distribution) {}
 
-  void call(Matrix<T, Device::CPU>& mat_a, matrix::RetiledMatrix<T, Device::CPU>& mat_taus,
-            const SizeType j_sub, const matrix::SubPanelView& panel_view) {
+  void call(Matrix<T, Device::CPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
+            const matrix::SubPanelView& panel_view) {
     using red2band::local::computePanelReflectors;
     computePanelReflectors(mat_a, mat_taus, j_sub, panel_view);
   }
 
   template <Device D, class CommSender, class TriggerSender>
   void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
-            Matrix<T, D>& mat_a, matrix::RetiledMatrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
+            Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
             const matrix::SubPanelView& panel_view) {
     using red2band::distributed::computePanelReflectors;
     computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
@@ -852,8 +851,8 @@ struct ComputePanelHelper<Backend::GPU, Device::GPU, T> {
   ComputePanelHelper(const std::size_t n_workspaces, matrix::Distribution dist_a)
       : panels_v(n_workspaces, dist_a) {}
 
-  void call(Matrix<T, Device::GPU>& mat_a, matrix::RetiledMatrix<T, Device::CPU>& mat_taus,
-            const SizeType j_sub, const matrix::SubPanelView& panel_view) {
+  void call(Matrix<T, Device::GPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
+            const matrix::SubPanelView& panel_view) {
     using red2band::local::computePanelReflectors;
 
     namespace ex = pika::execution::experimental;
@@ -872,7 +871,7 @@ struct ComputePanelHelper<Backend::GPU, Device::GPU, T> {
 
   template <Device D, class CommSender, class TriggerSender>
   void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
-            Matrix<T, D>& mat_a, matrix::RetiledMatrix<T, Device::CPU>& mat_taus, SizeType j_sub,
+            Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, SizeType j_sub,
             const matrix::SubPanelView& panel_view) {
     auto& v = panels_v.nextResource();
 
@@ -962,8 +961,8 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(Matrix<T, D>& mat_a, const
   if (nrefls == 0)
     return mat_taus;
 
-  matrix::RetiledMatrix<T, Device::CPU> mat_taus_retiled(
-      mat_taus, LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
+  Matrix<T, Device::CPU> mat_taus_retiled =
+      mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
 
   const SizeType ntiles = (nrefls - 1) / band_size + 1;
   DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
@@ -1113,8 +1112,8 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid gri
   if (nrefls == 0)
     return mat_taus;
 
-  matrix::RetiledMatrix<T, Device::CPU> mat_taus_retiled(
-      mat_taus, LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
+  Matrix<T, Device::CPU> mat_taus_retiled =
+      mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
 
   const SizeType ntiles = (nrefls - 1) / band_size + 1;
   DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());