=@aIkd*Q_OfMB-z&
z*uGiWzAPs1YS+rJ)?VY{JG?fxO^=6v=Py62&$IRR>WcLjdT#u?xh}K+nX&rWwKwJb
zOCQ{ez5Mas_KDixn|D9kv~c3?XOrG0`b+limAGGDroQFNl84v#9PR06Qhl0kdE)x~
zPY&AR>vpPK&sSL)(*5kcgjZnczD*CC-Uj-!^{QB29m$Mnm}Wd3H=^8D{o77r%RH
zT-K{H_r+r%DXVqCV-hf>c^0j7Q1;Xl%}_bH6cl||gwNWX|IC|m;ml{7`wsOC=7&NR
Usu_xMfCT}Ar>mdKI;Vst0C5ZrKL7v#
diff --git a/master/classes.html b/master/classes.html
index 8262688254..1ccedec598 100644
--- a/master/classes.html
+++ b/master/classes.html
@@ -111,7 +111,7 @@
QR (dlaf::factorization::internal)QR_Tfactor (dlaf::factorization::internal)
- R
-- ReductionToBand (dlaf::eigensolver::internal)
- RetiledMatrix (dlaf::matrix)
- RoundRobin (dlaf::common)
+ReductionToBand (dlaf::eigensolver::internal)RoundRobin (dlaf::common)
- S
- ScopedInitializer (dlaf)
- SenderSingleValueTypeImpl (dlaf::internal)
- SenderSingleValueTypeImpl< TypeList< TypeList< pika::execution::experimental::async_rw_mutex_access_wrapper< RWType, RType, pika::execution::experimental::async_rw_mutex_access_type::read > > > > (dlaf::internal)
- SenderSingleValueTypeImpl< TypeList< TypeList< pika::execution::experimental::async_rw_mutex_access_wrapper< RWType, RType, pika::execution::experimental::async_rw_mutex_access_type::readwrite > > > > (dlaf::internal)
- SenderSingleValueTypeImpl< TypeList< TypeList< std::reference_wrapper< T > > > > (dlaf::internal)
- SenderSingleValueTypeImpl< TypeList< TypeList< T > > > (dlaf::internal)
- SingleThreadedBlasScope (dlaf::common::internal)
- Size2D (dlaf::common)
- source_location (dlaf::common::internal)
- SubMatrixView (dlaf::matrix)
- SubPanelView (dlaf::matrix)
- Matrix< const T, D >::SubPipelineTag (dlaf::matrix)
- SubTileSpec (dlaf::matrix)
- SweepWorker (dlaf::eigensolver::internal)
- SweepWorkerDist (dlaf::eigensolver::internal)
diff --git a/master/copy_8h_source.html b/master/copy_8h_source.html
index e97f2399b7..6d6a219a23 100644
--- a/master/copy_8h_source.html
+++ b/master/copy_8h_source.html
@@ -131,7 +131,7 @@
-ReadWriteSenderType readwrite(const LocalTileIndex &index) noexcept
Definition: matrix.h:121
+ReadWriteSenderType readwrite(const LocalTileIndex &index) noexcept
Definition: matrix.h:122
diff --git a/master/dir_21834082df7a318c018c9cc963be86ec.html b/master/dir_21834082df7a318c018c9cc963be86ec.html
index bdf58ac8c6..d213edd02b 100644
--- a/master/dir_21834082df7a318c018c9cc963be86ec.html
+++ b/master/dir_21834082df7a318c018c9cc963be86ec.html
@@ -99,8 +99,6 @@
|
file | print_numpy.h [code] |
|
-file | retiled_matrix.h [code] |
- |
file | tile.h [code] |
|
file | util_distribution.h [code] |
diff --git a/master/eigensolver_2bt__band__to__tridiag_2impl_8h_source.html b/master/eigensolver_2bt__band__to__tridiag_2impl_8h_source.html
index 9766a50f56..e8ab1d2087 100644
--- a/master/eigensolver_2bt__band__to__tridiag_2impl_8h_source.html
+++ b/master/eigensolver_2bt__band__to__tridiag_2impl_8h_source.html
@@ -110,966 +110,965 @@
-
-
- 44 #include <dlaf/sender/policy.h>
- 45 #include <dlaf/sender/traits.h>
- 46 #include <dlaf/sender/transform.h>
- 47 #include <dlaf/sender/when_all_lift.h>
-
-
-
-
-
- 53 namespace dlaf::eigensolver::internal {
-
- 55 namespace bt_tridiag {
-
-
- 58 matrix::Tile<T, Device::CPU> setupVWellFormed(
const SizeType b,
- 59 const matrix::Tile<const T, Device::CPU>& tile_hh,
- 60 matrix::Tile<T, Device::CPU> tile_v) {
-
-
-
- 64 common::internal::SingleThreadedBlasScope single;
-
-
-
-
- 69 const auto k = tile_v.size().cols();
-
-
- 72 for (SizeType j = 0; j < k; ++j) {
- 73 const auto compact_refl_size =
- 74 std::min<SizeType>(tile_v.size().rows() - (1 + j), tile_hh.size().rows() - 1);
-
-
- 77 if (compact_refl_size == 0)
-
-
- 80 lacpy(blas::Uplo::General, compact_refl_size, 1, tile_hh.ptr({1, j}), tile_hh.ld(),
- 81 tile_v.ptr({1 + j, j}), tile_v.ld());
-
-
-
-
-
-
-
- 89 laset(blas::Uplo::Upper, tile_v.size().rows(), k, T(0), T(1), tile_v.ptr({0, 0}), tile_v.ld());
-
- 91 if (tile_v.size().rows() > b)
- 92 laset(blas::Uplo::Lower, tile_v.size().rows() - b, k - 1, T(0), T(0), tile_v.ptr({b, 0}),
-
-
-
-
-
-
- 99 void computeTFactor(
const matrix::Tile<const T, Device::CPU>& tile_taus,
- 100 const matrix::Tile<const T, Device::CPU>& tile_v,
- 101 const matrix::Tile<T, Device::CPU>& tile_t) {
- 102 using namespace lapack;
-
- 104 common::internal::SingleThreadedBlasScope single;
-
-
-
- 108 taus.resize(
to_sizet(tile_v.size().cols()));
- 109 for (SizeType i = 0; i <
to_SizeType(taus.size()); ++i)
- 110 taus[
to_sizet(i)] = tile_taus({0, i});
-
- 112 const auto n = tile_v.size().rows();
- 113 const auto k = tile_v.size().cols();
- 114 larft(Direction::Forward, StoreV::Columnwise, n, k, tile_v.ptr(), tile_v.ld(), taus.data(),
- 115 tile_t.ptr(), tile_t.ld());
-
-
-
- 119 std::tuple<matrix::Tile<T, Device::CPU>, matrix::Tile<T, Device::CPU>> computeVT(
- 120 const SizeType b,
const matrix::Tile<const T, Device::CPU>& tile_hh,
const SizeType hhr_nb,
- 121 matrix::Tile<T, Device::CPU> tile_v, matrix::Tile<T, Device::CPU> tile_t) {
- 122 auto tile_v2 = setupVWellFormed(b, tile_hh, std::move(tile_v));
- 123 for (SizeType j = 0; j < tile_v2.size().cols(); j += hhr_nb) {
- 124 const SizeType jb = std::min(hhr_nb, tile_v2.size().cols() - j);
- 125 const SizeType ib = std::min(jb + b - 1, tile_v2.size().rows() - j);
- 126 auto subtile_t = tile_t.subTileReference({{j, j}, {jb, jb}});
- 127 auto subtile_hh = tile_hh.subTileReference({{0, j}, {1, jb}});
- 128 auto subtile_v = tile_v2.subTileReference({{j, j}, {ib, jb}});
- 129 computeTFactor(subtile_hh, subtile_v, subtile_t);
-
- 131 return std::make_tuple(std::move(tile_v2), std::move(tile_t));
-
-
-
- 135 std::tuple<matrix::Tile<T, Device::CPU>, matrix::Tile<T, Device::CPU>> computeVW(
- 136 const SizeType b,
const matrix::Tile<const T, Device::CPU>& tile_hh,
const SizeType hhr_nb,
- 137 matrix::Tile<T, Device::CPU> tile_v, matrix::Tile<T, Device::CPU> tile_t,
- 138 matrix::Tile<T, Device::CPU> tile_w) {
- 139 using namespace blas;
-
- 141 auto [tile_v2, tile_t2] = computeVT(b, tile_hh, hhr_nb, std::move(tile_v), std::move(tile_t));
-
- 143 for (SizeType j = 0; j < tile_v2.size().cols(); j += hhr_nb) {
- 144 const SizeType jb = std::min(hhr_nb, tile_v2.size().cols() - j);
- 145 const SizeType ib = std::min(jb + b - 1, tile_v2.size().rows() - j);
- 146 auto subtile_t = tile_t2.subTileReference({{j, j}, {jb, jb}});
- 147 auto subtile_v = tile_v2.subTileReference({{j, j}, {ib, jb}});
- 148 auto subtile_w = tile_w.subTileReference({{j, j}, {ib, jb}});
-
- 150 dlaf::tile::internal::trmm3(Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, T(1), subtile_t,
- 151 subtile_v, subtile_w);
-
- 153 return std::make_tuple(std::move(tile_v2), std::move(tile_w));
-
-
- 156 template <
class Tile,
class CTile>
- 157 std::tuple<CTile, CTile, Tile, Tile> applyHHToSingleTileRowSubtileHelper(
- 158 const SizeType j,
const SizeType jb,
const CTile& tile_v,
const CTile& tile_w,
const Tile& tile_w2,
- 159 const Tile& tile_e) {
- 160 DLAF_ASSERT_HEAVY(tile_v.size() == tile_w.size(), tile_v, tile_w);
- 161 DLAF_ASSERT_HEAVY(tile_e.size().rows() - 1 == tile_v.size().rows(), tile_e, tile_v);
- 162 DLAF_ASSERT_HEAVY(tile_e.size().cols() == tile_w2.size().cols(), tile_e, tile_w2);
-
- 164 const SizeType ib = tile_v.size().rows() - j;
- 165 auto subtile_v = tile_v.subTileReference({{j, j}, {ib, jb}});
- 166 auto subtile_w = tile_w.subTileReference({{j, j}, {ib, jb}});
- 167 auto subtile_w2 = tile_w2.subTileReference({{0, 0}, {jb, tile_w2.size().cols()}});
- 168 auto subtile_e = tile_e.subTileReference({{j + 1, 0}, tile_e.size() - TileElementSize{j + 1, 0}});
-
- 170 return {std::move(subtile_v), std::move(subtile_w), std::move(subtile_w2), std::move(subtile_e)};
-
-
- 173 template <Backend B,
class T>
-
-
-
-
-
-
-
-
- 182 using namespace blas;
- 183 using tile::internal::gemm;
-
-
-
- 187 for (SizeType j = (util::ceilDiv(tile_v.size().cols(), hhr_nb) - 1) * hhr_nb; j >= 0; j -= hhr_nb) {
- 188 const SizeType jb = std::min(hhr_nb, tile_v.size().cols() - j);
- 189 auto [subtile_v, subtile_w, subtile_w2, subtile_e] =
- 190 applyHHToSingleTileRowSubtileHelper(j, jb, tile_v, tile_w, tile_w2, tile_e);
-
-
- 193 gemm(Op::ConjTrans, Op::NoTrans, T(1), subtile_v, subtile_e, T(0), subtile_w2);
-
- 195 gemm(Op::NoTrans, Op::NoTrans, T(-1), subtile_w, subtile_w2, T(1), subtile_e);
-
-
-
-
-
-
-
- 203 void operator()(cublasHandle_t handle,
const SizeType hhr_nb,
-
-
-
-
- 208 using namespace blas;
- 209 using tile::internal::gemm;
-
-
-
- 213 for (SizeType j = (util::ceilDiv(tile_v.size().cols(), hhr_nb) - 1) * hhr_nb; j >= 0; j -= hhr_nb) {
- 214 const SizeType jb = std::min(hhr_nb, tile_v.size().cols() - j);
- 215 auto [subtile_v, subtile_w, subtile_w2, subtile_e] =
- 216 applyHHToSingleTileRowSubtileHelper(j, jb, tile_v, tile_w, tile_w2, tile_e);
-
-
- 219 gemm(handle, Op::ConjTrans, Op::NoTrans, T(1), subtile_v, subtile_e, T(0), subtile_w2);
-
- 221 gemm(handle, Op::NoTrans, Op::NoTrans, T(-1), subtile_w, subtile_w2, T(1), subtile_e);
-
-
-
-
-
- 227 template <
class Tile,
class CTile>
- 228 std::tuple<CTile, CTile, CTile, CTile, Tile, Tile, Tile> applyHHToDoubleTileRowSubtileHelper(
- 229 const SizeType j,
const SizeType jb,
const CTile& tile_v,
const CTile& tile_w,
const Tile& tile_w2,
- 230 const Tile& tile_e_top,
const Tile& tile_e_bottom) {
- 231 DLAF_ASSERT_HEAVY(tile_v.size() == tile_w.size(), tile_v, tile_w);
- 232 DLAF_ASSERT_HEAVY(tile_e_top.size().rows() + tile_e_bottom.size().rows() - 1 == tile_v.size().rows(),
- 233 tile_e_top, tile_e_bottom, tile_v);
- 234 DLAF_ASSERT_HEAVY(tile_e_top.size().cols() == tile_w2.size().cols(), tile_e_top, tile_w2);
- 235 DLAF_ASSERT_HEAVY(tile_e_bottom.size().cols() == tile_w2.size().cols(), tile_e_bottom, tile_w2);
-
-
- 238 tile_e_top.subTileReference({{j + 1, 0}, tile_e_top.size() -
TileElementSize{j + 1, 0}});
- 239 auto subtile_e_bottom = tile_e_bottom.subTileReference(
-
- 241 TileElementSize{std::min(tile_e_bottom.size().rows(), j + jb), tile_e_bottom.size().cols()}});
-
- 243 matrix::SubTileSpec spec_top{{j, j}, {subtile_e_top.size().rows(), jb}};
- 244 matrix::SubTileSpec spec_bottom{{tile_e_top.size().rows() - 1, j},
- 245 {subtile_e_bottom.size().rows(), jb}};
-
- 247 auto subtile_v_top = tile_v.subTileReference(spec_top);
- 248 auto subtile_v_bottom = tile_v.subTileReference(spec_bottom);
- 249 auto subtile_w_top = tile_w.subTileReference(spec_top);
- 250 auto subtile_w_bottom = tile_w.subTileReference(spec_bottom);
- 251 auto subtile_w2 = tile_w2.subTileReference({{0, 0}, {jb, tile_w2.size().cols()}});
-
- 253 return {std::move(subtile_v_top), std::move(subtile_v_bottom), std::move(subtile_w_top),
- 254 std::move(subtile_w_bottom), std::move(subtile_w2), std::move(subtile_e_top),
- 255 std::move(subtile_e_bottom)};
-
-
- 258 template <Backend B,
class T>
-
-
-
-
-
-
-
-
-
- 268 using namespace blas;
- 269 using tile::internal::gemm;
-
-
-
- 273 for (SizeType j = (util::ceilDiv(tile_v.size().cols(), hhr_nb) - 1) * hhr_nb; j >= 0; j -= hhr_nb) {
- 274 const SizeType jb = std::min(hhr_nb, tile_v.size().cols() - j);
- 275 auto [subtile_v_top, subtile_v_bottom, subtile_w_top, subtile_w_bottom, subtile_w2, subtile_e_top,
-
- 277 applyHHToDoubleTileRowSubtileHelper(j, jb, tile_v, tile_w, tile_w2, tile_e_top, tile_e_bottom);
-
-
- 280 gemm(Op::ConjTrans, Op::NoTrans, T(1), subtile_v_top, subtile_e_top, T(0), subtile_w2);
- 281 gemm(Op::ConjTrans, Op::NoTrans, T(1), subtile_v_bottom, subtile_e_bottom, T(1), subtile_w2);
-
- 283 gemm(Op::NoTrans, Op::NoTrans, T(-1), subtile_w_top, subtile_w2, T(1), subtile_e_top);
- 284 gemm(Op::NoTrans, Op::NoTrans, T(-1), subtile_w_bottom, subtile_w2, T(1), subtile_e_bottom);
-
-
-
-
-
-
-
- 292 void operator()(cublasHandle_t handle,
const SizeType hhr_nb,
-
-
-
-
-
- 298 using namespace blas;
- 299 using tile::internal::gemm;
-
-
-
- 303 for (SizeType j = (util::ceilDiv(tile_v.size().cols(), hhr_nb) - 1) * hhr_nb; j >= 0; j -= hhr_nb) {
- 304 const SizeType jb = std::min(hhr_nb, tile_v.size().cols() - j);
- 305 auto [subtile_v_top, subtile_v_bottom, subtile_w_top, subtile_w_bottom, subtile_w2, subtile_e_top,
-
- 307 applyHHToDoubleTileRowSubtileHelper(j, jb, tile_v, tile_w, tile_w2, tile_e_top, tile_e_bottom);
-
-
- 310 gemm(handle, Op::ConjTrans, Op::NoTrans, T(1), subtile_v_top, subtile_e_top, T(0), subtile_w2);
- 311 gemm(handle, Op::ConjTrans, Op::NoTrans, T(1), subtile_v_bottom, subtile_e_bottom, T(1),
-
-
- 314 gemm(handle, Op::NoTrans, Op::NoTrans, T(-1), subtile_w_top, subtile_w2, T(1), subtile_e_top);
- 315 gemm(handle, Op::NoTrans, Op::NoTrans, T(-1), subtile_w_bottom, subtile_w2, T(1),
-
-
-
-
-
-
-
-
-
-
- 326 {std::min(b, dist_hh.size().rows() - offset.row()),
- 327 std::min(b, dist_hh.size().cols() - offset.col())}},
-
-
-
-
-
-
-
-
-
- 337 rows_v_ = std::min(2 * b, dist_e.size().rows() - offset.row()) - 1;
- 338 rows_v_top_ = std::min(rows_v_, b - 1);
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 363 across_tiles_ = !(index_e_ == dist_e.
nrTiles().rows() - 1);
-
- 365 across_ranks_ = across_tiles_ && (dist_e.
rankGlobalTile<Coord::Row>(index_e_) !=
-
-
-
-
- 370 bool affectsMultipleTiles()
const noexcept {
- 371 return across_tiles_;
-
-
-
- 375 bool affectsMultipleRanks()
const noexcept {
- 376 return across_ranks_;
-
-
-
-
-
-
-
-
-
-
- 387 return {{0, 0}, input_spec_.size};
-
-
-
-
-
- 393 return {{0, 0}, {rows_v_, nrefls_}};
-
-
-
-
-
-
- 400 return {{0, 0}, {rows_v_top_, nrefls_}};
- 401 DLAF_ASSERT_MODERATE(affectsMultipleTiles(), affectsMultipleTiles());
- 402 return {{rows_v_top_, 0}, {rows_v_ - rows_v_top_, nrefls_}};
-
-
-
-
- 407 return {{0, 0}, {nrefls_, nrefls_}};
-
-
-
- 411 return {{0, 0}, {nrefls_, cols}};
-
-
-
- 415 return {index_e_, j};
-
-
-
- 419 DLAF_ASSERT_MODERATE(affectsMultipleTiles(), affectsMultipleTiles());
- 420 return {index_e_ + 1, j};
-
-
-
-
-
-
-
- 428 SizeType rows_v_top_;
-
-
-
-
-
-
-
-
- 449 : dist_hh(dist_hh), b(b), mb(dist_hh.blockSize().rows()), helper(helper), ij(ij),
-
- 451 rank = dist_hh.rankIndex();
-
- 453 n_ws_per_block =
to_SizeType(
static_cast<size_t>(std::ceil(mb / b / 2.0f)) + 1);
-
-
-
- 457 return (rankHH.row() + 1) % dist_hh.commGridSize().rows();
-
-
- 460 bool isInvolved()
const {
- 461 const bool isSameRow = rank.row() == rankHH.row();
- 462 const bool isPartnerRow = rank.row() == rankRowPartner();
- 463 return isSameRow || (isPartnerRow && helper.affectsMultipleRanks());
-
-
-
- 467 const SizeType row = [&]() -> SizeType {
- 468 if (rank.row() == rankHH.row()) {
-
- 470 const SizeType intra_idx = 1 + (ij_b_row % (mb / b)) / 2;
- 471 DLAF_ASSERT_HEAVY(intra_idx < n_ws_per_block, intra_idx, n_ws_per_block);
-
-
-
- 475 DLAF_ASSERT_HEAVY(helper.affectsMultipleRanks() && (rank.row() == rankRowPartner()),
- 476 helper.affectsMultipleRanks(), rank.row(), rankRowPartner());
-
-
-
-
-
-
-
-
-
-
-
-
- 489 SizeType n_ws_per_block;
-
-
-
-
-
-
-
-
-
-
- 500 template <Backend B, Device D,
class T>
-
-
-
-
- 505 static constexpr Backend B = Backend::MC;
- 506 static constexpr Device D = Device::CPU;
-
-
-
- 510 template <
class SenderHH>
-
-
-
- 514 namespace ex = pika::execution::experimental;
-
- 516 return dlaf::internal::whenAllLift(b, std::forward<SenderHH>(tile_hh), nb_apply,
- 517 splitTile(mat_v.
readwrite(ij), helper.specHH()),
- 518 splitTile(mat_t.
readwrite(ij), helper.specT()),
- 519 splitTile(mat_w.
readwrite(ij), helper.specHH())) |
-
-
-
-
-
-
-
-
-
-
-
- 531 static constexpr Backend B = Backend::GPU;
- 532 static constexpr Device D = Device::GPU;
-
-
-
- 536 : b(b), t_panels_h(n_workspaces, dist_t), w_panels_h(n_workspaces, dist_w) {}
-
- 538 template <
class SenderHH>
-
-
-
- 542 namespace ex = pika::execution::experimental;
-
- 544 auto& mat_v_h = w_panels_h.nextResource();
- 545 auto& mat_t_h = t_panels_h.nextResource();
-
-
-
-
- 550 auto [tile_v_h, tile_t_h] =
- 551 dlaf::internal::whenAllLift(b, std::forward<SenderHH>(tile_hh), hhr_nb,
- 552 splitTile(mat_v_h.readwrite(ij), helper.specHH()),
- 553 splitTile(mat_t_h.readwrite(ij_t), t_spec)) |
-
-
-
- 557 auto copyVTandComputeW =
-
-
-
-
- 562 whip::stream_t stream;
- 563 DLAF_GPUBLAS_CHECK_ERROR(cublasGetStream(handle, &stream));
-
- 565 matrix::internal::copy(tile_v_h, tile_v, stream);
- 566 matrix::internal::copy(tile_t_h, tile_t, stream);
-
-
- 569 using namespace blas;
- 570 for (SizeType j = 0; j < tile_v_h.size().cols(); j += hhr_nb) {
- 571 const SizeType jb = std::min(hhr_nb, tile_v_h.size().cols() - j);
- 572 const SizeType ib = std::min(jb + b - 1, tile_v_h.size().rows() - j);
- 573 auto subtile_t = tile_t.subTileReference({{j, j}, {jb, jb}});
- 574 auto subtile_v = tile_v.subTileReference({{j, j}, {ib, jb}});
- 575 auto subtile_w = tile_w.subTileReference({{j, j}, {ib, jb}});
-
- 577 dlaf::tile::internal::trmm3(handle, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit,
- 578 T(1), subtile_t, subtile_v, subtile_w);
-
-
- 581 return std::make_tuple(std::move(tile_v), std::move(tile_w));
-
-
- 584 return ex::when_all(std::move(tile_v_h), std::move(tile_t_h),
- 585 splitTile(mat_v.
readwrite(ij), helper.specHH()),
- 586 splitTile(mat_t.
readwrite(ij_t), t_spec),
- 587 splitTile(mat_w.
readwrite(ij), helper.specHH())) |
- 588 dlaf::internal::transform<dlaf::internal::TransformDispatchType::Blas>(
-
-
-
-
-
-
-
-
-
-
-
-
- 601 template <Backend B, Device D,
class T>
-
- 603 Matrix<const T, Device::CPU>& mat_hh) {
- 604 using pika::execution::thread_priority;
- 605 namespace ex = pika::execution::experimental;
-
- 607 using common::iterate_range2d;
-
-
- 610 using namespace bt_tridiag;
-
- 612 if (mat_hh.size().isEmpty() || mat_e.size().isEmpty())
-
-
-
- 616 if (mat_hh.size().rows() <= (dlaf::isComplex_v<T> ? 1 : 2))
-
-
- 619 const SizeType b = band_size;
- 620 const SizeType group_size = getTuneParameters().bt_band_to_tridiag_hh_apply_group_size;
- 621 const SizeType nsweeps = nrSweeps<T>(mat_hh.size().cols());
-
- 623 const LocalTileSize tiles_per_block(mat_e.blockSize().rows() / b, 1);
-
-
- 626 const auto& dist_hh = mat_hh.distribution();
- 627 const auto& dist_e_rt = mat_e_rt.distribution();
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 644 const SizeType dist_w_rows = mat_e_rt.nrTiles().rows() * w_tile_sz.rows();
-
-
- 647 const matrix::Distribution dist_w2({b, mat_e_rt.size().cols()}, {b, mat_e_rt.blockSize().cols()});
-
- 649 constexpr std::size_t n_workspaces = 2;
- 650 RoundRobin<Panel<Coord::Col, T, D>> t_panels(n_workspaces, dist_t);
- 651 RoundRobin<Panel<Coord::Col, T, D>> v_panels(n_workspaces, dist_w);
- 652 RoundRobin<Panel<Coord::Col, T, D>> w_panels(n_workspaces, dist_w);
- 653 RoundRobin<Panel<Coord::Row, T, D>> w2_panels(n_workspaces, dist_w2);
-
- 655 HHManager<B, D, T> helperBackend(b, n_workspaces, dist_t, dist_w);
-
-
- 658 const SizeType j_last_sweep = (nsweeps - 1) / b;
- 659 for (SizeType j = j_last_sweep; j >= 0; --j) {
- 660 auto& mat_t = t_panels.nextResource();
- 661 auto& mat_v = v_panels.nextResource();
- 662 auto& mat_w = w_panels.nextResource();
- 663 auto& mat_w2 = w2_panels.nextResource();
-
-
- 666 const SizeType steps = nrStepsForSweep(j * b, mat_hh.size().cols(), b);
- 667 for (SizeType step = 0; step < steps; ++step) {
- 668 const SizeType i = j + step;
-
- 670 const GlobalElementIndex ij_el(i * b, j * b);
- 671 const LocalTileIndex ij(dist_hh.localTileIndex(dist_hh.globalTileIndex(ij_el)));
-
-
-
- 675 const SizeType nrefls = [&]() {
- 676 const bool allowSize1 = isComplex_v<T> && j == j_last_sweep && step == steps - 1;
- 677 const GlobalElementSize delta(dist_hh.size().rows() - ij_el.row() - 1,
- 678 std::min(b, dist_hh.size().cols() - ij_el.col()));
- 679 return std::min(b, std::min(delta.rows() - (allowSize1 ? 0 : 1), delta.cols()));
-
-
- 682 const TileAccessHelper helper(b, nrefls, dist_hh, dist_e_rt, ij_el);
-
-
- 685 mat_t.setWidth(nrefls);
- 686 mat_v.setWidth(nrefls);
- 687 mat_w.setWidth(nrefls);
- 688 mat_w2.setHeight(nrefls);
-
-
- 691 auto [tile_v_unshared, tile_w_unshared] =
- 692 helperBackend.computeVW(group_size, ij, helper,
- 693 splitTile(mat_hh.read(ij), helper.specHHCompact()), mat_v, mat_t,
-
- 695 auto tile_v = matrix::shareReadWriteTile(ex::make_unique_any_sender(std::move(tile_v_unshared)));
- 696 auto tile_w = matrix::shareReadWriteTile(ex::make_unique_any_sender(std::move(tile_w_unshared)));
-
- 698 for (SizeType j_e = 0; j_e < dist_e_rt.nrTiles().cols(); ++j_e) {
- 699 const auto idx_e = helper.topIndexE(j_e);
-
- 701 if (!helper.affectsMultipleTiles()) {
-
- 703 ex::when_all(ex::just(group_size), tile_v, tile_w,
- 704 mat_w2.readwrite(LocalTileIndex(0, j_e)), mat_e_rt.readwrite(idx_e)) |
- 705 dlaf::internal::transform<dlaf::internal::TransformDispatchType::Blas>(
-
-
-
-
- 710 ex::when_all(ex::just(group_size), tile_v, tile_w,
- 711 mat_w2.readwrite(LocalTileIndex(0, j_e)), mat_e_rt.readwrite(idx_e),
- 712 mat_e_rt.readwrite(helper.bottomIndexE(j_e))) |
- 713 dlaf::internal::transform<dlaf::internal::TransformDispatchType::Blas>(
-
-
-
-
-
-
-
-
-
-
-
-
- 726 template <Backend B, Device D,
class T>
- 727 void BackTransformationT2B<B, D, T>::call(comm::CommunicatorGrid grid,
const SizeType band_size,
- 728 Matrix<T, D>& mat_e, Matrix<const T, Device::CPU>& mat_hh) {
- 729 using pika::execution::thread_priority;
- 730 namespace ex = pika::execution::experimental;
-
- 732 using common::iterate_range2d;
- 733 using common::RoundRobin;
-
- 735 using namespace bt_tridiag;
-
- 737 if (mat_hh.size().isEmpty() || mat_e.size().isEmpty())
-
-
-
- 741 if (nrSweeps<T>(mat_hh.size().rows()) == 0)
-
-
- 744 const SizeType b = band_size;
- 745 const SizeType mb = mat_hh.blockSize().rows();
- 746 const SizeType group_size = getTuneParameters().bt_band_to_tridiag_hh_apply_group_size;
-
- 748 const LocalTileSize tiles_per_block(mat_e.blockSize().rows() / b, 1);
- 749 matrix::RetiledMatrix<T, D> mat_e_rt(mat_e, tiles_per_block);
-
- 751 const auto& dist_hh = mat_hh.distribution();
- 752 const auto& dist_e_rt = mat_e_rt.distribution();
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 767 const TileElementSize w_tile_sz(2 * b - 1, b);
-
- 769 const SizeType nlocal_ws =
- 770 std::max<SizeType>(1, dist_hh.localNrTiles().rows() * (util::ceilDiv<SizeType>(mb / b, 2) + 1));
- 771 const matrix::Distribution dist_ws_hh({nlocal_ws * b, b}, {b, b});
- 772 const matrix::Distribution dist_ws_v({nlocal_ws * w_tile_sz.rows(), w_tile_sz.cols()}, w_tile_sz);
- 773 const matrix::Distribution dist_ws_w2({nlocal_ws * b, mat_e_rt.size().cols()},
- 774 {b, mat_e_rt.blockSize().cols()});
-
- 776 constexpr std::size_t n_workspaces = 2;
-
- 778 RoundRobin<Panel<Coord::Col, T, D>> t_panels(n_workspaces, dist_ws_hh);
- 779 RoundRobin<Panel<Coord::Col, T, Device::CPU>> hh_panels(n_workspaces, dist_ws_hh);
-
- 781 RoundRobin<Panel<Coord::Col, T, D>> v_panels(n_workspaces, dist_ws_v);
- 782 RoundRobin<Panel<Coord::Col, T, D>> w_panels(n_workspaces, dist_ws_v);
-
- 784 RoundRobin<Panel<Coord::Row, T, D>> w2_panels(n_workspaces, dist_ws_w2);
- 785 RoundRobin<Panel<Coord::Row, T, D>> w2tmp_panels(n_workspaces, dist_ws_w2);
-
- 787 HHManager<B, D, T> helperBackend(b, n_workspaces, dist_ws_hh, dist_ws_v);
-
-
-
-
-
-
-
-
-
-
- 798 common::Pipeline<comm::Communicator> mpi_chain_row(grid.rowCommunicator().clone());
- 799 common::Pipeline<comm::Communicator> mpi_chain_col(grid.colCommunicator().clone());
- 800 const auto mpi_col_comm = ex::just(grid.colCommunicator().clone());
-
- 802 const SizeType idx_last_sweep_b = (nrSweeps<T>(mat_hh.size().cols()) - 1) / b;
- 803 const SizeType maxsteps_b = nrStepsForSweep(0, mat_hh.size().rows(), b);
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 833 for (SizeType k = idx_last_sweep_b; k > -maxsteps_b; --k) {
- 834 auto& mat_t = t_panels.nextResource();
- 835 auto& panel_hh = hh_panels.nextResource();
- 836 auto& mat_v = v_panels.nextResource();
- 837 auto& mat_w = w_panels.nextResource();
- 838 auto& mat_w2 = w2_panels.nextResource();
- 839 auto& mat_w2tmp = w2tmp_panels.nextResource();
-
- 841 for (SizeType i_b = std::abs<SizeType>(k), j_b = std::max<SizeType>(0, k);
- 842 i_b < j_b + nrStepsForSweep(j_b * b, mat_hh.size().cols(), b); i_b += 2, ++j_b) {
- 843 const SizeType step_b = i_b - j_b;
- 844 const GlobalElementIndex ij_el(i_b * b, j_b * b);
- 845 const GlobalTileIndex ij_g(dist_hh.globalTileIndex(ij_el));
-
- 847 const comm::Index2D rank = dist_hh.rankIndex();
- 848 const comm::Index2D rankHH = dist_hh.rankGlobalTile(ij_g);
-
-
-
- 852 const SizeType nrefls = [&]() {
- 853 const bool allowSize1 = isComplex_v<T> && j_b == idx_last_sweep_b &&
- 854 step_b == nrStepsForSweep(j_b * b, mat_hh.size().cols(), b) - 1;
- 855 const GlobalElementSize delta(dist_hh.size().rows() - ij_el.row() - 1,
- 856 std::min(b, dist_hh.size().cols() - ij_el.col()));
- 857 return std::min(b, std::min(delta.rows() - (allowSize1 ? 0 : 1), delta.cols()));
-
-
- 860 const TileAccessHelper helper(b, nrefls, dist_hh, dist_e_rt, ij_el);
- 861 const DistIndexing indexing_helper(helper, dist_hh, b, ij_g, i_b);
-
- 863 if (!indexing_helper.isInvolved())
-
-
-
- 867 mat_t.setWidth(nrefls);
- 868 mat_v.setWidth(nrefls);
- 869 mat_w.setWidth(nrefls);
- 870 mat_w2.setHeight(nrefls);
- 871 mat_w2tmp.setHeight(nrefls);
-
-
-
-
-
-
-
- 879 const LocalTileIndex ij_hh_panel = indexing_helper.wsIndexHH();
-
-
- 882 if (grid.size().cols() > 1 && rank.row() == rankHH.row()) {
- 883 if (rank.col() == rankHH.col()) {
- 884 ex::start_detached(comm::scheduleSendBcast(
- 885 mpi_chain_row(),
splitTile(mat_hh.read(ij_g), helper.specHHCompact())));
-
-
- 888 ex::start_detached(comm::scheduleRecvBcast(mpi_chain_row(), rankHH.col(),
- 889 splitTile(panel_hh.readwrite(ij_hh_panel),
- 890 helper.specHHCompact(
true))));
-
-
-
-
- 895 const SizeType ncols_local = dist_e_rt.localNrTiles().cols();
- 896 if (ncols_local == 0)
-
-
-
- 900 if (helper.affectsMultipleRanks()) {
- 901 const comm::IndexT_MPI rank_src = rankHH.row();
- 902 const comm::IndexT_MPI rank_dst = indexing_helper.rankRowPartner();
-
- 904 if (rank.row() == rank_src) {
- 905 auto tile_hh = rank.col() == rankHH.col()
- 906 ?
splitTile(mat_hh.read(ij_g), helper.specHHCompact())
- 907 :
splitTile(panel_hh.read(ij_hh_panel), helper.specHHCompact(true));
- 908 ex::start_detached(comm::scheduleSend(mpi_chain_col(), rank_dst, 0, std::move(tile_hh)));
-
- 910 else if (rank.row() == rank_dst) {
- 911 ex::start_detached(comm::scheduleRecv(mpi_chain_col(), rank_src, 0,
- 912 panel_hh.readwrite(ij_hh_panel)));
-
-
-
-
-
- 918 const SizeType current_group_size = helper.affectsMultipleRanks() ? b : group_size;
-
-
- 921 auto tile_hh = (rankHH == rank)
- 922 ?
splitTile(mat_hh.read(ij_g), helper.specHHCompact())
- 923 :
splitTile(panel_hh.read(ij_hh_panel), helper.specHHCompact(
true));
- 924 auto [tile_v_unshared, tile_w_unshared] =
- 925 helperBackend.computeVW(current_group_size, indexing_helper.wsIndexHH(), helper,
- 926 std::move(tile_hh), mat_v, mat_t, mat_w);
- 927 auto tile_v = matrix::shareReadWriteTile(ex::make_unique_any_sender(std::move(tile_v_unshared)));
- 928 auto tile_w = matrix::shareReadWriteTile(ex::make_unique_any_sender(std::move(tile_w_unshared)));
-
-
- 931 for (SizeType j_e = 0; j_e < ncols_local; ++j_e) {
- 932 const SizeType j_e_g = dist_e_rt.template globalTileFromLocalTile<Coord::Col>(j_e);
- 933 const LocalTileIndex idx_w2(indexing_helper.wsIndexHH().row(), j_e);
-
- 935 const GlobalTileIndex idx_e_top = helper.topIndexE(j_e_g);
- 936 const auto nb = mat_e_rt.tileSize(idx_e_top).cols();
-
-
- 939 if (!helper.affectsMultipleTiles()) {
- 940 ex::start_detached(ex::when_all(ex::just(current_group_size), tile_v, tile_w,
- 941 splitTile(mat_w2.readwrite(idx_w2), helper.specW2(nb)),
- 942 mat_e_rt.readwrite(idx_e_top)) |
- 943 dlaf::internal::transform<dlaf::internal::TransformDispatchType::Blas>(
-
- 945 ApplyHHToSingleTileRow<B, T>{}));
-
-
-
- 949 const GlobalTileIndex idx_e_bottom = helper.bottomIndexE(j_e_g);
-
-
- 952 if (!helper.affectsMultipleRanks()) {
-
- 954 ex::when_all(ex::just(current_group_size), tile_v, tile_w,
- 955 splitTile(mat_w2.readwrite(idx_w2), helper.specW2(nb)),
- 956 mat_e_rt.readwrite(idx_e_top), mat_e_rt.readwrite(idx_e_bottom)) |
- 957 dlaf::internal::transform<dlaf::internal::TransformDispatchType::Blas>(
-
-
-
-
- 962 const bool is_top_rank = rank.row() == rankHH.row();
- 963 const comm::IndexT_MPI rank_partner =
- 964 is_top_rank ? indexing_helper.rankRowPartner() : rankHH.row();
-
- 966 const comm::IndexT_MPI tag =
to_int(j_e + i_b * ncols_local);
-
- 968 const matrix::SubTileSpec spec_vw = helper.specHH(is_top_rank);
-
- 970 const auto idx_e = is_top_rank ? idx_e_top : idx_e_bottom;
- 971 const auto sz_e = mat_e_rt.tileSize(idx_e);
- 972 const matrix::SubTileSpec spec_e{{(is_top_rank ? 1 : 0), 0},
- 973 {sz_e.rows() - (is_top_rank ? 1 : 0), sz_e.cols()}};
-
- 975 auto subtile_v =
splitTile(tile_v, spec_vw);
- 976 auto subtile_w =
splitTile(tile_w, spec_vw);
- 977 auto subtile_e_ro =
splitTile(mat_e_rt.read(idx_e), spec_e);
-
-
-
- 981 dlaf::internal::whenAllLift(blas::Op::ConjTrans, blas::Op::NoTrans, T(1),
- 982 std::move(subtile_v), std::move(subtile_e_ro), T(0),
- 983 splitTile(mat_w2tmp.readwrite(idx_w2), helper.specW2(nb))) |
-
-
-
-
- 988 comm::scheduleAllSumP2P<B>(mpi_col_comm, rank_partner, tag,
- 989 splitTile(mat_w2tmp.read(idx_w2), helper.specW2(nb)),
- 990 splitTile(mat_w2.readwrite(idx_w2), helper.specW2(nb))));
-
- 992 auto subtile_e =
splitTile(mat_e_rt.readwrite(idx_e), spec_e);
-
-
- 995 dlaf::internal::whenAllLift(blas::Op::NoTrans, blas::Op::NoTrans, T(-1),
- 996 std::move(subtile_w),
- 997 splitTile(mat_w2.read(idx_w2), helper.specW2(nb)), T(1),
- 998 std::move(subtile_e)) |
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+ 43 #include <dlaf/sender/policy.h>
+ 44 #include <dlaf/sender/traits.h>
+ 45 #include <dlaf/sender/transform.h>
+ 46 #include <dlaf/sender/when_all_lift.h>
+
+
+
+
+
+ 52 namespace dlaf::eigensolver::internal {
+
+ 54 namespace bt_tridiag {
+
+
+ 57 matrix::Tile<T, Device::CPU> setupVWellFormed(
const SizeType b,
+ 58 const matrix::Tile<const T, Device::CPU>& tile_hh,
+ 59 matrix::Tile<T, Device::CPU> tile_v) {
+
+
+
+ 63 common::internal::SingleThreadedBlasScope single;
+
+
+
+
+ 68 const auto k = tile_v.size().cols();
+
+
+ 71 for (SizeType j = 0; j < k; ++j) {
+ 72 const auto compact_refl_size =
+ 73 std::min<SizeType>(tile_v.size().rows() - (1 + j), tile_hh.size().rows() - 1);
+
+
+ 76 if (compact_refl_size == 0)
+
+
+ 79 lacpy(blas::Uplo::General, compact_refl_size, 1, tile_hh.ptr({1, j}), tile_hh.ld(),
+ 80 tile_v.ptr({1 + j, j}), tile_v.ld());
+
+
+
+
+
+
+
+ 88 laset(blas::Uplo::Upper, tile_v.size().rows(), k, T(0), T(1), tile_v.ptr({0, 0}), tile_v.ld());
+
+ 90 if (tile_v.size().rows() > b)
+ 91 laset(blas::Uplo::Lower, tile_v.size().rows() - b, k - 1, T(0), T(0), tile_v.ptr({b, 0}),
+
+
+
+
+
+
+ 98 void computeTFactor(
const matrix::Tile<const T, Device::CPU>& tile_taus,
+ 99 const matrix::Tile<const T, Device::CPU>& tile_v,
+ 100 const matrix::Tile<T, Device::CPU>& tile_t) {
+ 101 using namespace lapack;
+
+ 103 common::internal::SingleThreadedBlasScope single;
+
+
+
+ 107 taus.resize(
to_sizet(tile_v.size().cols()));
+ 108 for (SizeType i = 0; i <
to_SizeType(taus.size()); ++i)
+ 109 taus[
to_sizet(i)] = tile_taus({0, i});
+
+ 111 const auto n = tile_v.size().rows();
+ 112 const auto k = tile_v.size().cols();
+ 113 larft(Direction::Forward, StoreV::Columnwise, n, k, tile_v.ptr(), tile_v.ld(), taus.data(),
+ 114 tile_t.ptr(), tile_t.ld());
+
+
+
+ 118 std::tuple<matrix::Tile<T, Device::CPU>, matrix::Tile<T, Device::CPU>> computeVT(
+ 119 const SizeType b,
const matrix::Tile<const T, Device::CPU>& tile_hh,
const SizeType hhr_nb,
+ 120 matrix::Tile<T, Device::CPU> tile_v, matrix::Tile<T, Device::CPU> tile_t) {
+ 121 auto tile_v2 = setupVWellFormed(b, tile_hh, std::move(tile_v));
+ 122 for (SizeType j = 0; j < tile_v2.size().cols(); j += hhr_nb) {
+ 123 const SizeType jb = std::min(hhr_nb, tile_v2.size().cols() - j);
+ 124 const SizeType ib = std::min(jb + b - 1, tile_v2.size().rows() - j);
+ 125 auto subtile_t = tile_t.subTileReference({{j, j}, {jb, jb}});
+ 126 auto subtile_hh = tile_hh.subTileReference({{0, j}, {1, jb}});
+ 127 auto subtile_v = tile_v2.subTileReference({{j, j}, {ib, jb}});
+ 128 computeTFactor(subtile_hh, subtile_v, subtile_t);
+
+ 130 return std::make_tuple(std::move(tile_v2), std::move(tile_t));
+
+
+
+ 134 std::tuple<matrix::Tile<T, Device::CPU>, matrix::Tile<T, Device::CPU>> computeVW(
+ 135 const SizeType b,
const matrix::Tile<const T, Device::CPU>& tile_hh,
const SizeType hhr_nb,
+ 136 matrix::Tile<T, Device::CPU> tile_v, matrix::Tile<T, Device::CPU> tile_t,
+ 137 matrix::Tile<T, Device::CPU> tile_w) {
+ 138 using namespace blas;
+
+ 140 auto [tile_v2, tile_t2] = computeVT(b, tile_hh, hhr_nb, std::move(tile_v), std::move(tile_t));
+
+ 142 for (SizeType j = 0; j < tile_v2.size().cols(); j += hhr_nb) {
+ 143 const SizeType jb = std::min(hhr_nb, tile_v2.size().cols() - j);
+ 144 const SizeType ib = std::min(jb + b - 1, tile_v2.size().rows() - j);
+ 145 auto subtile_t = tile_t2.subTileReference({{j, j}, {jb, jb}});
+ 146 auto subtile_v = tile_v2.subTileReference({{j, j}, {ib, jb}});
+ 147 auto subtile_w = tile_w.subTileReference({{j, j}, {ib, jb}});
+
+ 149 dlaf::tile::internal::trmm3(Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, T(1), subtile_t,
+ 150 subtile_v, subtile_w);
+
+ 152 return std::make_tuple(std::move(tile_v2), std::move(tile_w));
+
+
+ 155 template <
class Tile,
class CTile>
+ 156 std::tuple<CTile, CTile, Tile, Tile> applyHHToSingleTileRowSubtileHelper(
+ 157 const SizeType j,
const SizeType jb,
const CTile& tile_v,
const CTile& tile_w,
const Tile& tile_w2,
+ 158 const Tile& tile_e) {
+ 159 DLAF_ASSERT_HEAVY(tile_v.size() == tile_w.size(), tile_v, tile_w);
+ 160 DLAF_ASSERT_HEAVY(tile_e.size().rows() - 1 == tile_v.size().rows(), tile_e, tile_v);
+ 161 DLAF_ASSERT_HEAVY(tile_e.size().cols() == tile_w2.size().cols(), tile_e, tile_w2);
+
+ 163 const SizeType ib = tile_v.size().rows() - j;
+ 164 auto subtile_v = tile_v.subTileReference({{j, j}, {ib, jb}});
+ 165 auto subtile_w = tile_w.subTileReference({{j, j}, {ib, jb}});
+ 166 auto subtile_w2 = tile_w2.subTileReference({{0, 0}, {jb, tile_w2.size().cols()}});
+ 167 auto subtile_e = tile_e.subTileReference({{j + 1, 0}, tile_e.size() - TileElementSize{j + 1, 0}});
+
+ 169 return {std::move(subtile_v), std::move(subtile_w), std::move(subtile_w2), std::move(subtile_e)};
+
+
+ 172 template <Backend B,
class T>
+
+
+
+
+
+
+
+
+ 181 using namespace blas;
+ 182 using tile::internal::gemm;
+
+
+
+ 186 for (SizeType j = (util::ceilDiv(tile_v.size().cols(), hhr_nb) - 1) * hhr_nb; j >= 0; j -= hhr_nb) {
+ 187 const SizeType jb = std::min(hhr_nb, tile_v.size().cols() - j);
+ 188 auto [subtile_v, subtile_w, subtile_w2, subtile_e] =
+ 189 applyHHToSingleTileRowSubtileHelper(j, jb, tile_v, tile_w, tile_w2, tile_e);
+
+
+ 192 gemm(Op::ConjTrans, Op::NoTrans, T(1), subtile_v, subtile_e, T(0), subtile_w2);
+
+ 194 gemm(Op::NoTrans, Op::NoTrans, T(-1), subtile_w, subtile_w2, T(1), subtile_e);
+
+
+
+
+
+
+
+ 202 void operator()(cublasHandle_t handle,
const SizeType hhr_nb,
+
+
+
+
+ 207 using namespace blas;
+ 208 using tile::internal::gemm;
+
+
+
+ 212 for (SizeType j = (util::ceilDiv(tile_v.size().cols(), hhr_nb) - 1) * hhr_nb; j >= 0; j -= hhr_nb) {
+ 213 const SizeType jb = std::min(hhr_nb, tile_v.size().cols() - j);
+ 214 auto [subtile_v, subtile_w, subtile_w2, subtile_e] =
+ 215 applyHHToSingleTileRowSubtileHelper(j, jb, tile_v, tile_w, tile_w2, tile_e);
+
+
+ 218 gemm(handle, Op::ConjTrans, Op::NoTrans, T(1), subtile_v, subtile_e, T(0), subtile_w2);
+
+ 220 gemm(handle, Op::NoTrans, Op::NoTrans, T(-1), subtile_w, subtile_w2, T(1), subtile_e);
+
+
+
+
+
+ 226 template <
class Tile,
class CTile>
+ 227 std::tuple<CTile, CTile, CTile, CTile, Tile, Tile, Tile> applyHHToDoubleTileRowSubtileHelper(
+ 228 const SizeType j,
const SizeType jb,
const CTile& tile_v,
const CTile& tile_w,
const Tile& tile_w2,
+ 229 const Tile& tile_e_top,
const Tile& tile_e_bottom) {
+ 230 DLAF_ASSERT_HEAVY(tile_v.size() == tile_w.size(), tile_v, tile_w);
+ 231 DLAF_ASSERT_HEAVY(tile_e_top.size().rows() + tile_e_bottom.size().rows() - 1 == tile_v.size().rows(),
+ 232 tile_e_top, tile_e_bottom, tile_v);
+ 233 DLAF_ASSERT_HEAVY(tile_e_top.size().cols() == tile_w2.size().cols(), tile_e_top, tile_w2);
+ 234 DLAF_ASSERT_HEAVY(tile_e_bottom.size().cols() == tile_w2.size().cols(), tile_e_bottom, tile_w2);
+
+
+ 237 tile_e_top.subTileReference({{j + 1, 0}, tile_e_top.size() -
TileElementSize{j + 1, 0}});
+ 238 auto subtile_e_bottom = tile_e_bottom.subTileReference(
+
+ 240 TileElementSize{std::min(tile_e_bottom.size().rows(), j + jb), tile_e_bottom.size().cols()}});
+
+ 242 matrix::SubTileSpec spec_top{{j, j}, {subtile_e_top.size().rows(), jb}};
+ 243 matrix::SubTileSpec spec_bottom{{tile_e_top.size().rows() - 1, j},
+ 244 {subtile_e_bottom.size().rows(), jb}};
+
+ 246 auto subtile_v_top = tile_v.subTileReference(spec_top);
+ 247 auto subtile_v_bottom = tile_v.subTileReference(spec_bottom);
+ 248 auto subtile_w_top = tile_w.subTileReference(spec_top);
+ 249 auto subtile_w_bottom = tile_w.subTileReference(spec_bottom);
+ 250 auto subtile_w2 = tile_w2.subTileReference({{0, 0}, {jb, tile_w2.size().cols()}});
+
+ 252 return {std::move(subtile_v_top), std::move(subtile_v_bottom), std::move(subtile_w_top),
+ 253 std::move(subtile_w_bottom), std::move(subtile_w2), std::move(subtile_e_top),
+ 254 std::move(subtile_e_bottom)};
+
+
+ 257 template <Backend B,
class T>
+
+
+
+
+
+
+
+
+
+ 267 using namespace blas;
+ 268 using tile::internal::gemm;
+
+
+
+ 272 for (SizeType j = (util::ceilDiv(tile_v.size().cols(), hhr_nb) - 1) * hhr_nb; j >= 0; j -= hhr_nb) {
+ 273 const SizeType jb = std::min(hhr_nb, tile_v.size().cols() - j);
+ 274 auto [subtile_v_top, subtile_v_bottom, subtile_w_top, subtile_w_bottom, subtile_w2, subtile_e_top,
+
+ 276 applyHHToDoubleTileRowSubtileHelper(j, jb, tile_v, tile_w, tile_w2, tile_e_top, tile_e_bottom);
+
+
+ 279 gemm(Op::ConjTrans, Op::NoTrans, T(1), subtile_v_top, subtile_e_top, T(0), subtile_w2);
+ 280 gemm(Op::ConjTrans, Op::NoTrans, T(1), subtile_v_bottom, subtile_e_bottom, T(1), subtile_w2);
+
+ 282 gemm(Op::NoTrans, Op::NoTrans, T(-1), subtile_w_top, subtile_w2, T(1), subtile_e_top);
+ 283 gemm(Op::NoTrans, Op::NoTrans, T(-1), subtile_w_bottom, subtile_w2, T(1), subtile_e_bottom);
+
+
+
+
+
+
+
+ 291 void operator()(cublasHandle_t handle,
const SizeType hhr_nb,
+
+
+
+
+
+ 297 using namespace blas;
+ 298 using tile::internal::gemm;
+
+
+
+ 302 for (SizeType j = (util::ceilDiv(tile_v.size().cols(), hhr_nb) - 1) * hhr_nb; j >= 0; j -= hhr_nb) {
+ 303 const SizeType jb = std::min(hhr_nb, tile_v.size().cols() - j);
+ 304 auto [subtile_v_top, subtile_v_bottom, subtile_w_top, subtile_w_bottom, subtile_w2, subtile_e_top,
+
+ 306 applyHHToDoubleTileRowSubtileHelper(j, jb, tile_v, tile_w, tile_w2, tile_e_top, tile_e_bottom);
+
+
+ 309 gemm(handle, Op::ConjTrans, Op::NoTrans, T(1), subtile_v_top, subtile_e_top, T(0), subtile_w2);
+ 310 gemm(handle, Op::ConjTrans, Op::NoTrans, T(1), subtile_v_bottom, subtile_e_bottom, T(1),
+
+
+ 313 gemm(handle, Op::NoTrans, Op::NoTrans, T(-1), subtile_w_top, subtile_w2, T(1), subtile_e_top);
+ 314 gemm(handle, Op::NoTrans, Op::NoTrans, T(-1), subtile_w_bottom, subtile_w2, T(1),
+
+
+
+
+
+
+
+
+
+
+ 325 {std::min(b, dist_hh.size().rows() - offset.row()),
+ 326 std::min(b, dist_hh.size().cols() - offset.col())}},
+
+
+
+
+
+
+
+
+
+ 336 rows_v_ = std::min(2 * b, dist_e.size().rows() - offset.row()) - 1;
+ 337 rows_v_top_ = std::min(rows_v_, b - 1);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 362 across_tiles_ = !(index_e_ == dist_e.
nrTiles().rows() - 1);
+
+ 364 across_ranks_ = across_tiles_ && (dist_e.
rankGlobalTile<Coord::Row>(index_e_) !=
+
+
+
+
+ 369 bool affectsMultipleTiles()
const noexcept {
+ 370 return across_tiles_;
+
+
+
+ 374 bool affectsMultipleRanks()
const noexcept {
+ 375 return across_ranks_;
+
+
+
+
+
+
+
+
+
+
+ 386 return {{0, 0}, input_spec_.size};
+
+
+
+
+
+ 392 return {{0, 0}, {rows_v_, nrefls_}};
+
+
+
+
+
+
+ 399 return {{0, 0}, {rows_v_top_, nrefls_}};
+ 400 DLAF_ASSERT_MODERATE(affectsMultipleTiles(), affectsMultipleTiles());
+ 401 return {{rows_v_top_, 0}, {rows_v_ - rows_v_top_, nrefls_}};
+
+
+
+
+ 406 return {{0, 0}, {nrefls_, nrefls_}};
+
+
+
+ 410 return {{0, 0}, {nrefls_, cols}};
+
+
+
+ 414 return {index_e_, j};
+
+
+
+ 418 DLAF_ASSERT_MODERATE(affectsMultipleTiles(), affectsMultipleTiles());
+ 419 return {index_e_ + 1, j};
+
+
+
+
+
+
+
+ 427 SizeType rows_v_top_;
+
+
+
+
+
+
+
+
+ 448 : dist_hh(dist_hh), b(b), mb(dist_hh.blockSize().rows()), helper(helper), ij(ij),
+
+ 450 rank = dist_hh.rankIndex();
+
+ 452 n_ws_per_block =
to_SizeType(
static_cast<size_t>(std::ceil(mb / b / 2.0f)) + 1);
+
+
+
+ 456 return (rankHH.row() + 1) % dist_hh.commGridSize().rows();
+
+
+ 459 bool isInvolved()
const {
+ 460 const bool isSameRow = rank.row() == rankHH.row();
+ 461 const bool isPartnerRow = rank.row() == rankRowPartner();
+ 462 return isSameRow || (isPartnerRow && helper.affectsMultipleRanks());
+
+
+
+ 466 const SizeType row = [&]() -> SizeType {
+ 467 if (rank.row() == rankHH.row()) {
+
+ 469 const SizeType intra_idx = 1 + (ij_b_row % (mb / b)) / 2;
+ 470 DLAF_ASSERT_HEAVY(intra_idx < n_ws_per_block, intra_idx, n_ws_per_block);
+
+
+
+ 474 DLAF_ASSERT_HEAVY(helper.affectsMultipleRanks() && (rank.row() == rankRowPartner()),
+ 475 helper.affectsMultipleRanks(), rank.row(), rankRowPartner());
+
+
+
+
+
+
+
+
+
+
+
+
+ 488 SizeType n_ws_per_block;
+
+
+
+
+
+
+
+
+
+
+ 499 template <Backend B, Device D,
class T>
+
+
+
+
+ 504 static constexpr Backend B = Backend::MC;
+ 505 static constexpr Device D = Device::CPU;
+
+
+
+ 509 template <
class SenderHH>
+
+
+
+ 513 namespace ex = pika::execution::experimental;
+
+ 515 return dlaf::internal::whenAllLift(b, std::forward<SenderHH>(tile_hh), nb_apply,
+ 516 splitTile(mat_v.
readwrite(ij), helper.specHH()),
+ 517 splitTile(mat_t.
readwrite(ij), helper.specT()),
+ 518 splitTile(mat_w.
readwrite(ij), helper.specHH())) |
+
+
+
+
+
+
+
+
+
+
+
+ 530 static constexpr Backend B = Backend::GPU;
+ 531 static constexpr Device D = Device::GPU;
+
+
+
+ 535 : b(b), t_panels_h(n_workspaces, dist_t), w_panels_h(n_workspaces, dist_w) {}
+
+ 537 template <
class SenderHH>
+
+
+
+ 541 namespace ex = pika::execution::experimental;
+
+ 543 auto& mat_v_h = w_panels_h.nextResource();
+ 544 auto& mat_t_h = t_panels_h.nextResource();
+
+
+
+
+ 549 auto [tile_v_h, tile_t_h] =
+ 550 dlaf::internal::whenAllLift(b, std::forward<SenderHH>(tile_hh), hhr_nb,
+ 551 splitTile(mat_v_h.readwrite(ij), helper.specHH()),
+ 552 splitTile(mat_t_h.readwrite(ij_t), t_spec)) |
+
+
+
+ 556 auto copyVTandComputeW =
+
+
+
+
+ 561 whip::stream_t stream;
+ 562 DLAF_GPUBLAS_CHECK_ERROR(cublasGetStream(handle, &stream));
+
+ 564 matrix::internal::copy(tile_v_h, tile_v, stream);
+ 565 matrix::internal::copy(tile_t_h, tile_t, stream);
+
+
+ 568 using namespace blas;
+ 569 for (SizeType j = 0; j < tile_v_h.size().cols(); j += hhr_nb) {
+ 570 const SizeType jb = std::min(hhr_nb, tile_v_h.size().cols() - j);
+ 571 const SizeType ib = std::min(jb + b - 1, tile_v_h.size().rows() - j);
+ 572 auto subtile_t = tile_t.subTileReference({{j, j}, {jb, jb}});
+ 573 auto subtile_v = tile_v.subTileReference({{j, j}, {ib, jb}});
+ 574 auto subtile_w = tile_w.subTileReference({{j, j}, {ib, jb}});
+
+ 576 dlaf::tile::internal::trmm3(handle, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit,
+ 577 T(1), subtile_t, subtile_v, subtile_w);
+
+
+ 580 return std::make_tuple(std::move(tile_v), std::move(tile_w));
+
+
+ 583 return ex::when_all(std::move(tile_v_h), std::move(tile_t_h),
+ 584 splitTile(mat_v.
readwrite(ij), helper.specHH()),
+ 585 splitTile(mat_t.
readwrite(ij_t), t_spec),
+ 586 splitTile(mat_w.
readwrite(ij), helper.specHH())) |
+ 587 dlaf::internal::transform<dlaf::internal::TransformDispatchType::Blas>(
+
+
+
+
+
+
+
+
+
+
+
+
+ 600 template <Backend B, Device D,
class T>
+
+ 602 Matrix<const T, Device::CPU>& mat_hh) {
+ 603 using pika::execution::thread_priority;
+ 604 namespace ex = pika::execution::experimental;
+
+ 606 using common::iterate_range2d;
+
+
+ 609 using namespace bt_tridiag;
+
+ 611 if (mat_hh.size().isEmpty() || mat_e.size().isEmpty())
+
+
+
+ 615 if (mat_hh.size().rows() <= (dlaf::isComplex_v<T> ? 1 : 2))
+
+
+ 618 const SizeType b = band_size;
+ 619 const SizeType group_size = getTuneParameters().bt_band_to_tridiag_hh_apply_group_size;
+ 620 const SizeType nsweeps = nrSweeps<T>(mat_hh.size().cols());
+
+ 622 const LocalTileSize tiles_per_block(mat_e.blockSize().rows() / b, 1);
+ 623 Matrix<T, D> mat_e_rt = mat_e.retiledSubPipeline(tiles_per_block);
+
+ 625 const auto& dist_hh = mat_hh.distribution();
+ 626 const auto& dist_e_rt = mat_e_rt.distribution();
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 643 const SizeType dist_w_rows = mat_e_rt.nrTiles().rows() * w_tile_sz.rows();
+
+
+ 646 const matrix::Distribution dist_w2({b, mat_e_rt.size().cols()}, {b, mat_e_rt.blockSize().cols()});
+
+ 648 constexpr std::size_t n_workspaces = 2;
+ 649 RoundRobin<Panel<Coord::Col, T, D>> t_panels(n_workspaces, dist_t);
+ 650 RoundRobin<Panel<Coord::Col, T, D>> v_panels(n_workspaces, dist_w);
+ 651 RoundRobin<Panel<Coord::Col, T, D>> w_panels(n_workspaces, dist_w);
+ 652 RoundRobin<Panel<Coord::Row, T, D>> w2_panels(n_workspaces, dist_w2);
+
+ 654 HHManager<B, D, T> helperBackend(b, n_workspaces, dist_t, dist_w);
+
+
+ 657 const SizeType j_last_sweep = (nsweeps - 1) / b;
+ 658 for (SizeType j = j_last_sweep; j >= 0; --j) {
+ 659 auto& mat_t = t_panels.nextResource();
+ 660 auto& mat_v = v_panels.nextResource();
+ 661 auto& mat_w = w_panels.nextResource();
+ 662 auto& mat_w2 = w2_panels.nextResource();
+
+
+ 665 const SizeType steps = nrStepsForSweep(j * b, mat_hh.size().cols(), b);
+ 666 for (SizeType step = 0; step < steps; ++step) {
+ 667 const SizeType i = j + step;
+
+ 669 const GlobalElementIndex ij_el(i * b, j * b);
+ 670 const LocalTileIndex ij(dist_hh.localTileIndex(dist_hh.globalTileIndex(ij_el)));
+
+
+
+ 674 const SizeType nrefls = [&]() {
+ 675 const bool allowSize1 = isComplex_v<T> && j == j_last_sweep && step == steps - 1;
+ 676 const GlobalElementSize delta(dist_hh.size().rows() - ij_el.row() - 1,
+ 677 std::min(b, dist_hh.size().cols() - ij_el.col()));
+ 678 return std::min(b, std::min(delta.rows() - (allowSize1 ? 0 : 1), delta.cols()));
+
+
+ 681 const TileAccessHelper helper(b, nrefls, dist_hh, dist_e_rt, ij_el);
+
+
+ 684 mat_t.setWidth(nrefls);
+ 685 mat_v.setWidth(nrefls);
+ 686 mat_w.setWidth(nrefls);
+ 687 mat_w2.setHeight(nrefls);
+
+
+ 690 auto [tile_v_unshared, tile_w_unshared] =
+ 691 helperBackend.computeVW(group_size, ij, helper,
+ 692 splitTile(mat_hh.read(ij), helper.specHHCompact()), mat_v, mat_t,
+
+ 694 auto tile_v = matrix::shareReadWriteTile(ex::make_unique_any_sender(std::move(tile_v_unshared)));
+ 695 auto tile_w = matrix::shareReadWriteTile(ex::make_unique_any_sender(std::move(tile_w_unshared)));
+
+ 697 for (SizeType j_e = 0; j_e < dist_e_rt.nrTiles().cols(); ++j_e) {
+ 698 const auto idx_e = helper.topIndexE(j_e);
+
+ 700 if (!helper.affectsMultipleTiles()) {
+
+ 702 ex::when_all(ex::just(group_size), tile_v, tile_w,
+ 703 mat_w2.readwrite(LocalTileIndex(0, j_e)), mat_e_rt.readwrite(idx_e)) |
+ 704 dlaf::internal::transform<dlaf::internal::TransformDispatchType::Blas>(
+
+
+
+
+ 709 ex::when_all(ex::just(group_size), tile_v, tile_w,
+ 710 mat_w2.readwrite(LocalTileIndex(0, j_e)), mat_e_rt.readwrite(idx_e),
+ 711 mat_e_rt.readwrite(helper.bottomIndexE(j_e))) |
+ 712 dlaf::internal::transform<dlaf::internal::TransformDispatchType::Blas>(
+
+
+
+
+
+
+
+
+
+
+
+
+ 725 template <Backend B, Device D,
class T>
+ 726 void BackTransformationT2B<B, D, T>::call(comm::CommunicatorGrid grid,
const SizeType band_size,
+ 727 Matrix<T, D>& mat_e, Matrix<const T, Device::CPU>& mat_hh) {
+ 728 using pika::execution::thread_priority;
+ 729 namespace ex = pika::execution::experimental;
+
+ 731 using common::iterate_range2d;
+ 732 using common::RoundRobin;
+
+ 734 using namespace bt_tridiag;
+
+ 736 if (mat_hh.size().isEmpty() || mat_e.size().isEmpty())
+
+
+
+ 740 if (nrSweeps<T>(mat_hh.size().rows()) == 0)
+
+
+ 743 const SizeType b = band_size;
+ 744 const SizeType mb = mat_hh.blockSize().rows();
+ 745 const SizeType group_size = getTuneParameters().bt_band_to_tridiag_hh_apply_group_size;
+
+ 747 const LocalTileSize tiles_per_block(mat_e.blockSize().rows() / b, 1);
+ 748 Matrix<T, D> mat_e_rt = mat_e.retiledSubPipeline(tiles_per_block);
+
+ 750 const auto& dist_hh = mat_hh.distribution();
+ 751 const auto& dist_e_rt = mat_e_rt.distribution();
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 766 const TileElementSize w_tile_sz(2 * b - 1, b);
+
+ 768 const SizeType nlocal_ws =
+ 769 std::max<SizeType>(1, dist_hh.localNrTiles().rows() * (util::ceilDiv<SizeType>(mb / b, 2) + 1));
+ 770 const matrix::Distribution dist_ws_hh({nlocal_ws * b, b}, {b, b});
+ 771 const matrix::Distribution dist_ws_v({nlocal_ws * w_tile_sz.rows(), w_tile_sz.cols()}, w_tile_sz);
+ 772 const matrix::Distribution dist_ws_w2({nlocal_ws * b, mat_e_rt.size().cols()},
+ 773 {b, mat_e_rt.blockSize().cols()});
+
+ 775 constexpr std::size_t n_workspaces = 2;
+
+ 777 RoundRobin<Panel<Coord::Col, T, D>> t_panels(n_workspaces, dist_ws_hh);
+ 778 RoundRobin<Panel<Coord::Col, T, Device::CPU>> hh_panels(n_workspaces, dist_ws_hh);
+
+ 780 RoundRobin<Panel<Coord::Col, T, D>> v_panels(n_workspaces, dist_ws_v);
+ 781 RoundRobin<Panel<Coord::Col, T, D>> w_panels(n_workspaces, dist_ws_v);
+
+ 783 RoundRobin<Panel<Coord::Row, T, D>> w2_panels(n_workspaces, dist_ws_w2);
+ 784 RoundRobin<Panel<Coord::Row, T, D>> w2tmp_panels(n_workspaces, dist_ws_w2);
+
+ 786 HHManager<B, D, T> helperBackend(b, n_workspaces, dist_ws_hh, dist_ws_v);
+
+
+
+
+
+
+
+
+
+
+ 797 common::Pipeline<comm::Communicator> mpi_chain_row(grid.rowCommunicator().clone());
+ 798 common::Pipeline<comm::Communicator> mpi_chain_col(grid.colCommunicator().clone());
+ 799 const auto mpi_col_comm = ex::just(grid.colCommunicator().clone());
+
+ 801 const SizeType idx_last_sweep_b = (nrSweeps<T>(mat_hh.size().cols()) - 1) / b;
+ 802 const SizeType maxsteps_b = nrStepsForSweep(0, mat_hh.size().rows(), b);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 832 for (SizeType k = idx_last_sweep_b; k > -maxsteps_b; --k) {
+ 833 auto& mat_t = t_panels.nextResource();
+ 834 auto& panel_hh = hh_panels.nextResource();
+ 835 auto& mat_v = v_panels.nextResource();
+ 836 auto& mat_w = w_panels.nextResource();
+ 837 auto& mat_w2 = w2_panels.nextResource();
+ 838 auto& mat_w2tmp = w2tmp_panels.nextResource();
+
+ 840 for (SizeType i_b = std::abs<SizeType>(k), j_b = std::max<SizeType>(0, k);
+ 841 i_b < j_b + nrStepsForSweep(j_b * b, mat_hh.size().cols(), b); i_b += 2, ++j_b) {
+ 842 const SizeType step_b = i_b - j_b;
+ 843 const GlobalElementIndex ij_el(i_b * b, j_b * b);
+ 844 const GlobalTileIndex ij_g(dist_hh.globalTileIndex(ij_el));
+
+ 846 const comm::Index2D rank = dist_hh.rankIndex();
+ 847 const comm::Index2D rankHH = dist_hh.rankGlobalTile(ij_g);
+
+
+
+ 851 const SizeType nrefls = [&]() {
+ 852 const bool allowSize1 = isComplex_v<T> && j_b == idx_last_sweep_b &&
+ 853 step_b == nrStepsForSweep(j_b * b, mat_hh.size().cols(), b) - 1;
+ 854 const GlobalElementSize delta(dist_hh.size().rows() - ij_el.row() - 1,
+ 855 std::min(b, dist_hh.size().cols() - ij_el.col()));
+ 856 return std::min(b, std::min(delta.rows() - (allowSize1 ? 0 : 1), delta.cols()));
+
+
+ 859 const TileAccessHelper helper(b, nrefls, dist_hh, dist_e_rt, ij_el);
+ 860 const DistIndexing indexing_helper(helper, dist_hh, b, ij_g, i_b);
+
+ 862 if (!indexing_helper.isInvolved())
+
+
+
+ 866 mat_t.setWidth(nrefls);
+ 867 mat_v.setWidth(nrefls);
+ 868 mat_w.setWidth(nrefls);
+ 869 mat_w2.setHeight(nrefls);
+ 870 mat_w2tmp.setHeight(nrefls);
+
+
+
+
+
+
+
+ 878 const LocalTileIndex ij_hh_panel = indexing_helper.wsIndexHH();
+
+
+ 881 if (grid.size().cols() > 1 && rank.row() == rankHH.row()) {
+ 882 if (rank.col() == rankHH.col()) {
+ 883 ex::start_detached(comm::scheduleSendBcast(
+ 884 mpi_chain_row(),
splitTile(mat_hh.read(ij_g), helper.specHHCompact())));
+
+
+ 887 ex::start_detached(comm::scheduleRecvBcast(mpi_chain_row(), rankHH.col(),
+ 888 splitTile(panel_hh.readwrite(ij_hh_panel),
+ 889 helper.specHHCompact(
true))));
+
+
+
+
+ 894 const SizeType ncols_local = dist_e_rt.localNrTiles().cols();
+ 895 if (ncols_local == 0)
+
+
+
+ 899 if (helper.affectsMultipleRanks()) {
+ 900 const comm::IndexT_MPI rank_src = rankHH.row();
+ 901 const comm::IndexT_MPI rank_dst = indexing_helper.rankRowPartner();
+
+ 903 if (rank.row() == rank_src) {
+ 904 auto tile_hh = rank.col() == rankHH.col()
+ 905 ?
splitTile(mat_hh.read(ij_g), helper.specHHCompact())
+ 906 :
splitTile(panel_hh.read(ij_hh_panel), helper.specHHCompact(true));
+ 907 ex::start_detached(comm::scheduleSend(mpi_chain_col(), rank_dst, 0, std::move(tile_hh)));
+
+ 909 else if (rank.row() == rank_dst) {
+ 910 ex::start_detached(comm::scheduleRecv(mpi_chain_col(), rank_src, 0,
+ 911 panel_hh.readwrite(ij_hh_panel)));
+
+
+
+
+
+ 917 const SizeType current_group_size = helper.affectsMultipleRanks() ? b : group_size;
+
+
+ 920 auto tile_hh = (rankHH == rank)
+ 921 ?
splitTile(mat_hh.read(ij_g), helper.specHHCompact())
+ 922 :
splitTile(panel_hh.read(ij_hh_panel), helper.specHHCompact(
true));
+ 923 auto [tile_v_unshared, tile_w_unshared] =
+ 924 helperBackend.computeVW(current_group_size, indexing_helper.wsIndexHH(), helper,
+ 925 std::move(tile_hh), mat_v, mat_t, mat_w);
+ 926 auto tile_v = matrix::shareReadWriteTile(ex::make_unique_any_sender(std::move(tile_v_unshared)));
+ 927 auto tile_w = matrix::shareReadWriteTile(ex::make_unique_any_sender(std::move(tile_w_unshared)));
+
+
+ 930 for (SizeType j_e = 0; j_e < ncols_local; ++j_e) {
+ 931 const SizeType j_e_g = dist_e_rt.template globalTileFromLocalTile<Coord::Col>(j_e);
+ 932 const LocalTileIndex idx_w2(indexing_helper.wsIndexHH().row(), j_e);
+
+ 934 const GlobalTileIndex idx_e_top = helper.topIndexE(j_e_g);
+ 935 const auto nb = mat_e_rt.tileSize(idx_e_top).cols();
+
+
+ 938 if (!helper.affectsMultipleTiles()) {
+ 939 ex::start_detached(ex::when_all(ex::just(current_group_size), tile_v, tile_w,
+ 940 splitTile(mat_w2.readwrite(idx_w2), helper.specW2(nb)),
+ 941 mat_e_rt.readwrite(idx_e_top)) |
+ 942 dlaf::internal::transform<dlaf::internal::TransformDispatchType::Blas>(
+
+ 944 ApplyHHToSingleTileRow<B, T>{}));
+
+
+
+ 948 const GlobalTileIndex idx_e_bottom = helper.bottomIndexE(j_e_g);
+
+
+ 951 if (!helper.affectsMultipleRanks()) {
+
+ 953 ex::when_all(ex::just(current_group_size), tile_v, tile_w,
+ 954 splitTile(mat_w2.readwrite(idx_w2), helper.specW2(nb)),
+ 955 mat_e_rt.readwrite(idx_e_top), mat_e_rt.readwrite(idx_e_bottom)) |
+ 956 dlaf::internal::transform<dlaf::internal::TransformDispatchType::Blas>(
+
+
+
+
+ 961 const bool is_top_rank = rank.row() == rankHH.row();
+ 962 const comm::IndexT_MPI rank_partner =
+ 963 is_top_rank ? indexing_helper.rankRowPartner() : rankHH.row();
+
+ 965 const comm::IndexT_MPI tag =
to_int(j_e + i_b * ncols_local);
+
+ 967 const matrix::SubTileSpec spec_vw = helper.specHH(is_top_rank);
+
+ 969 const auto idx_e = is_top_rank ? idx_e_top : idx_e_bottom;
+ 970 const auto sz_e = mat_e_rt.tileSize(idx_e);
+ 971 const matrix::SubTileSpec spec_e{{(is_top_rank ? 1 : 0), 0},
+ 972 {sz_e.rows() - (is_top_rank ? 1 : 0), sz_e.cols()}};
+
+ 974 auto subtile_v =
splitTile(tile_v, spec_vw);
+ 975 auto subtile_w =
splitTile(tile_w, spec_vw);
+ 976 auto subtile_e_ro =
splitTile(mat_e_rt.read(idx_e), spec_e);
+
+
+
+ 980 dlaf::internal::whenAllLift(blas::Op::ConjTrans, blas::Op::NoTrans, T(1),
+ 981 std::move(subtile_v), std::move(subtile_e_ro), T(0),
+ 982 splitTile(mat_w2tmp.readwrite(idx_w2), helper.specW2(nb))) |
+
+
+
+
+ 987 comm::scheduleAllSumP2P<B>(mpi_col_comm, rank_partner, tag,
+ 988 splitTile(mat_w2tmp.read(idx_w2), helper.specW2(nb)),
+ 989 splitTile(mat_w2.readwrite(idx_w2), helper.specW2(nb))));
+
+ 991 auto subtile_e =
splitTile(mat_e_rt.readwrite(idx_e), spec_e);
+
+
+ 994 dlaf::internal::whenAllLift(blas::Op::NoTrans, blas::Op::NoTrans, T(-1),
+ 995 std::move(subtile_w),
+ 996 splitTile(mat_w2.read(idx_w2), helper.specW2(nb)), T(1),
+ 997 std::move(subtile_e)) |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
@@ -1083,7 +1082,6 @@
const GlobalTileSize & nrTiles() const noexcept
Returns the number of tiles of the global matrix (2D size).
Definition: distribution.h:142
comm::Index2D rankGlobalTile(const GlobalTileIndex &global_tile) const noexcept
Definition: distribution.h:210
SizeType globalTileFromGlobalElement(SizeType global_element) const noexcept
Definition: distribution.h:292
-Definition: retiled_matrix.h:36
int IndexT_MPI
Type used for indexes in MPI API.
Definition: communicator.h:23
@@ -1101,16 +1099,15 @@
-
Definition: round_robin.h:20
-
-
-
-
-
+
+
+
+
+
ReadWriteSenderType readwrite(LocalTileIndex index)
Definition: panel.h:570
Contains the information to create a subtile.
Definition: tile.h:109
diff --git a/master/eigensolver_2eigensolver_8h_source.html b/master/eigensolver_2eigensolver_8h_source.html
index d2d4c70104..e97112b26d 100644
--- a/master/eigensolver_2eigensolver_8h_source.html
+++ b/master/eigensolver_2eigensolver_8h_source.html
@@ -106,54 +106,60 @@
50 DLAF_ASSERT(square_blocksize(eigenvectors), eigenvectors);
51 DLAF_ASSERT(eigenvectors.size() == mat.size(), eigenvectors, mat);
52 DLAF_ASSERT(eigenvectors.blockSize() == mat.blockSize(), eigenvectors, mat);
-
-
-
+ 53 DLAF_ASSERT(single_tile_per_block(mat), mat);
+ 54 DLAF_ASSERT(single_tile_per_block(eigenvalues), eigenvalues);
+ 55 DLAF_ASSERT(single_tile_per_block(eigenvectors), eigenvectors);
- 69 template <Backend B, Device D,
class T>
-
- 71 const SizeType size = mat.size().rows();
-
-
-
-
- 76 eigensolver<B, D, T>(uplo, mat, eigenvalues, eigenvectors);
- 77 return {std::move(eigenvalues), std::move(eigenvectors)};
-
-
- 95 template <Backend B, Device D,
class T>
-
- 97 Matrix<BaseType<T>, D>& eigenvalues, Matrix<T, D>& eigenvectors) {
- 98 DLAF_ASSERT(matrix::equal_process_grid(mat, grid), mat);
- 99 DLAF_ASSERT(square_size(mat), mat);
- 100 DLAF_ASSERT(square_blocksize(mat), mat);
- 101 DLAF_ASSERT(matrix::local_matrix(eigenvalues), eigenvalues);
- 102 DLAF_ASSERT(eigenvalues.size().rows() == eigenvectors.size().rows(), eigenvalues, eigenvectors);
- 103 DLAF_ASSERT(eigenvalues.blockSize().rows() == eigenvectors.blockSize().rows(), eigenvalues,
-
- 105 DLAF_ASSERT(matrix::equal_process_grid(eigenvectors, grid), eigenvectors);
- 106 DLAF_ASSERT(square_size(eigenvectors), eigenvectors);
- 107 DLAF_ASSERT(square_blocksize(eigenvectors), eigenvectors);
- 108 DLAF_ASSERT(eigenvectors.size() == mat.size(), eigenvectors, mat);
- 109 DLAF_ASSERT(eigenvectors.blockSize() == mat.blockSize(), eigenvectors, mat);
-
-
-
-
- 127 template <Backend B, Device D,
class T>
-
- 129 const SizeType size = mat.size().rows();
-
-
-
-
- 134 eigensolver<B, D, T>(grid, uplo, mat, eigenvalues, eigenvectors);
- 135 return {std::move(eigenvalues), std::move(eigenvectors)};
-
-
+
+
+
+ 72 template <Backend B, Device D,
class T>
+
+ 74 const SizeType size = mat.size().rows();
+
+
+
+
+ 79 eigensolver<B, D, T>(uplo, mat, eigenvalues, eigenvectors);
+ 80 return {std::move(eigenvalues), std::move(eigenvectors)};
+
+
+ 98 template <Backend B, Device D,
class T>
+
+ 100 Matrix<BaseType<T>, D>& eigenvalues, Matrix<T, D>& eigenvectors) {
+ 101 DLAF_ASSERT(matrix::equal_process_grid(mat, grid), mat);
+ 102 DLAF_ASSERT(square_size(mat), mat);
+ 103 DLAF_ASSERT(square_blocksize(mat), mat);
+ 104 DLAF_ASSERT(matrix::local_matrix(eigenvalues), eigenvalues);
+ 105 DLAF_ASSERT(eigenvalues.size().rows() == eigenvectors.size().rows(), eigenvalues, eigenvectors);
+ 106 DLAF_ASSERT(eigenvalues.blockSize().rows() == eigenvectors.blockSize().rows(), eigenvalues,
+
+ 108 DLAF_ASSERT(matrix::equal_process_grid(eigenvectors, grid), eigenvectors);
+ 109 DLAF_ASSERT(square_size(eigenvectors), eigenvectors);
+ 110 DLAF_ASSERT(square_blocksize(eigenvectors), eigenvectors);
+ 111 DLAF_ASSERT(eigenvectors.size() == mat.size(), eigenvectors, mat);
+ 112 DLAF_ASSERT(eigenvectors.blockSize() == mat.blockSize(), eigenvectors, mat);
+ 113 DLAF_ASSERT(single_tile_per_block(mat), mat);
+ 114 DLAF_ASSERT(single_tile_per_block(eigenvalues), eigenvalues);
+ 115 DLAF_ASSERT(single_tile_per_block(eigenvectors), eigenvectors);
+
+
+
+
+ 133 template <Backend B, Device D,
class T>
+
+ 135 const SizeType size = mat.size().rows();
+
+
+
+
+ 140 eigensolver<B, D, T>(grid, uplo, mat, eigenvalues, eigenvectors);
+ 141 return {std::move(eigenvalues), std::move(eigenvectors)};
+
+
Definition: communicator_grid.h:42
-
+
void eigensolver(blas::Uplo uplo, Matrix< T, D > &mat, Matrix< BaseType< T >, D > &eigenvalues, Matrix< T, D > &eigenvectors)
Definition: eigensolver.h:39
diff --git a/master/eigensolver_2reduction__to__band_2impl_8h_source.html b/master/eigensolver_2reduction__to__band_2impl_8h_source.html
index fc82107637..e0d7136b66 100644
--- a/master/eigensolver_2reduction__to__band_2impl_8h_source.html
+++ b/master/eigensolver_2reduction__to__band_2impl_8h_source.html
@@ -113,1361 +113,1359 @@
-
-
-
-
- 49 #include <dlaf/sender/traits.h>
-
-
-
-
- 54 namespace dlaf::eigensolver::internal {
-
-
-
- 58 void reduceColumnVectors(std::vector<common::internal::vector<T>>& columnVectors) {
- 59 for (std::size_t i = 1; i < columnVectors.size(); ++i) {
- 60 DLAF_ASSERT_HEAVY(columnVectors[0].size() == columnVectors[i].size(), columnVectors[0].size(),
- 61 columnVectors[i].size());
- 62 for (SizeType j = 0; j < columnVectors[0].size(); ++j)
- 63 columnVectors[0][j] += columnVectors[i][j];
-
-
-
-
-
-
- 70 template <Device D,
class T>
- 71 std::array<T, 2> computeX0AndSquares(
const bool has_head,
const std::vector<matrix::Tile<T, D>>& panel,
-
- 73 std::array<T, 2> x0_and_squares{0, 0};
- 74 auto it_begin = panel.begin();
- 75 auto it_end = panel.end();
-
- 77 common::internal::SingleThreadedBlasScope single;
-
-
- 80 auto& tile_v0 = *it_begin++;
-
- 82 const TileElementIndex idx_x0(j, j);
- 83 x0_and_squares[0] = tile_v0(idx_x0);
-
- 85 T* reflector_ptr = tile_v0.ptr({idx_x0});
-
- 87 blas::dot(tile_v0.size().rows() - idx_x0.row(), reflector_ptr, 1, reflector_ptr, 1);
-
-
- 90 for (
auto it = it_begin; it != it_end; ++it) {
- 91 const auto& tile = *it;
-
- 93 T* reflector_ptr = tile.ptr({0, j});
- 94 x0_and_squares[1] += blas::dot(tile.size().rows(), reflector_ptr, 1, reflector_ptr, 1);
-
- 96 return x0_and_squares;
-
-
- 99 template <Device D,
class T>
- 100 T computeReflectorAndTau(
const bool has_head,
const std::vector<matrix::Tile<T, D>>& panel,
- 101 const SizeType j, std::array<T, 2> x0_and_squares) {
- 102 const T
norm = std::sqrt(x0_and_squares[1]);
- 103 const T x0 = x0_and_squares[0];
- 104 const T y = std::signbit(std::real(x0_and_squares[0])) ?
norm : -
norm;
- 105 const T tau = (y - x0) / y;
-
- 107 auto it_begin = panel.begin();
- 108 auto it_end = panel.end();
-
- 110 common::internal::SingleThreadedBlasScope single;
-
-
- 113 const auto& tile_v0 = *it_begin++;
-
- 115 const TileElementIndex idx_x0(j, j);
-
-
- 118 if (j + 1 < tile_v0.size().rows()) {
- 119 T* v = tile_v0.ptr({j + 1, j});
- 120 blas::scal(tile_v0.size().rows() - (j + 1), T(1) / (x0 - y), v, 1);
-
-
-
- 124 for (
auto it = it_begin; it != it_end; ++it) {
-
- 126 T* v = tile_v.ptr({0, j});
- 127 blas::scal(tile_v.size().rows(), T(1) / (x0 - y), v, 1);
-
-
-
-
-
- 133 template <Device D,
class T>
- 134 void computeWTrailingPanel(
const bool has_head,
const std::vector<matrix::Tile<T, D>>& panel,
- 135 common::internal::vector<T>& w, SizeType j,
const SizeType pt_cols,
- 136 const std::size_t begin,
const std::size_t end) {
-
-
-
-
-
- 142 const TileElementIndex index_el_x0(j, j);
- 143 bool has_first_component = has_head;
-
- 145 common::internal::SingleThreadedBlasScope single;
-
-
- 148 for (
auto index = begin; index < end; ++index) {
- 149 const matrix::Tile<const T, D>& tile_a = panel[index];
- 150 const SizeType first_element = has_first_component ? index_el_x0.row() : 0;
-
- 152 TileElementIndex pt_start{first_element, index_el_x0.col() + 1};
- 153 TileElementSize pt_size{tile_a.size().rows() - pt_start.row(), pt_cols};
- 154 TileElementIndex v_start{first_element, index_el_x0.col()};
-
- 156 if (has_first_component) {
- 157 const TileElementSize offset{1, 0};
-
-
- 160 blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, offset.rows(), pt_size.cols(), T(1),
- 161 tile_a.ptr(pt_start), tile_a.ld(), &fake_v, 1, T(0), w.data(), 1);
-
- 163 pt_start = pt_start + offset;
- 164 v_start = v_start + offset;
- 165 pt_size = pt_size - offset;
-
- 167 has_first_component =
false;
-
-
- 170 if (pt_start.isIn(tile_a.size())) {
-
- 172 blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, pt_size.rows(), pt_size.cols(), T(1),
- 173 tile_a.ptr(pt_start), tile_a.ld(), tile_a.ptr(v_start), 1, T(1), w.data(), 1);
-
-
-
-
- 178 template <Device D,
class T>
- 179 void updateTrailingPanel(
const bool has_head,
const std::vector<matrix::Tile<T, D>>& panel, SizeType j,
- 180 const std::vector<T>& w,
const T tau,
const std::size_t begin,
- 181 const std::size_t end) {
- 182 const TileElementIndex index_el_x0(j, j);
-
- 184 bool has_first_component = has_head;
-
- 186 common::internal::SingleThreadedBlasScope single;
-
-
- 189 for (
auto index = begin; index < end; ++index) {
- 190 const matrix::Tile<T, D>& tile_a = panel[index];
- 191 const SizeType first_element = has_first_component ? index_el_x0.row() : 0;
-
- 193 TileElementIndex pt_start{first_element, index_el_x0.col() + 1};
- 194 TileElementSize pt_size{tile_a.size().rows() - pt_start.row(),
- 195 tile_a.size().cols() - pt_start.col()};
- 196 TileElementIndex v_start{first_element, index_el_x0.col()};
-
- 198 if (has_first_component) {
- 199 const TileElementSize offset{1, 0};
-
-
-
- 203 blas::ger(blas::Layout::ColMajor, 1, pt_size.cols(), -dlaf::conj(tau), &fake_v, 1, w.data(), 1,
- 204 tile_a.ptr(pt_start), tile_a.ld());
-
- 206 pt_start = pt_start + offset;
- 207 v_start = v_start + offset;
- 208 pt_size = pt_size - offset;
-
- 210 has_first_component =
false;
-
-
- 213 if (pt_start.isIn(tile_a.size())) {
-
- 215 blas::ger(blas::Layout::ColMajor, pt_size.rows(), pt_size.cols(), -dlaf::conj(tau),
- 216 tile_a.ptr(v_start), 1, w.data(), 1, tile_a.ptr(pt_start), tile_a.ld());
-
-
-
-
- 221 template <Backend B,
typename ASender,
typename WSender,
typename XSender>
- 222 void hemmDiag(pika::execution::thread_priority priority, ASender&& tile_a, WSender&& tile_w,
-
- 224 using T = dlaf::internal::SenderElementType<ASender>;
- 225 pika::execution::experimental::start_detached(
- 226 dlaf::internal::whenAllLift(blas::Side::Left, blas::Uplo::Lower, T(1),
- 227 std::forward<ASender>(tile_a), std::forward<WSender>(tile_w), T(1),
- 228 std::forward<XSender>(tile_x)) |
-
-
-
-
- 233 template <Backend B,
typename ASender,
typename WSender,
typename XSender>
- 234 void hemmOffDiag(pika::execution::thread_priority priority, blas::Op op, ASender&& tile_a,
- 235 WSender&& tile_w, XSender&& tile_x) {
- 236 using T = dlaf::internal::SenderElementType<ASender>;
- 237 pika::execution::experimental::start_detached(
- 238 dlaf::internal::whenAllLift(op, blas::Op::NoTrans, T(1), std::forward<ASender>(tile_a),
- 239 std::forward<WSender>(tile_w), T(1), std::forward<XSender>(tile_x)) |
-
-
-
- 243 template <Backend B,
typename VSender,
typename XSender,
typename ASender>
- 244 void her2kDiag(pika::execution::thread_priority priority, VSender&& tile_v, XSender&& tile_x,
-
- 246 using T = dlaf::internal::SenderElementType<VSender>;
- 247 pika::execution::experimental::start_detached(
- 248 dlaf::internal::whenAllLift(blas::Uplo::Lower, blas::Op::NoTrans, T(-1),
- 249 std::forward<VSender>(tile_v), std::forward<XSender>(tile_x),
- 250 BaseType<T>(1), std::forward<ASender>(tile_a)) |
-
-
-
-
- 255 template <Backend B,
typename ASender,
typename BSender,
typename CSender>
- 256 void her2kOffDiag(pika::execution::thread_priority priority, ASender&& tile_a, BSender&& tile_b,
-
- 258 using T = dlaf::internal::SenderElementType<ASender>;
- 259 pika::execution::experimental::start_detached(
- 260 dlaf::internal::whenAllLift(blas::Op::NoTrans, blas::Op::ConjTrans, T(-1),
- 261 std::forward<ASender>(tile_a), std::forward<BSender>(tile_b), T(1),
- 262 std::forward<CSender>(tile_c)) |
-
-
-
-
-
- 268 template <Device D,
class T>
- 269 T computeReflector(
const std::vector<matrix::Tile<T, D>>& panel, SizeType j) {
- 270 constexpr
bool has_head =
true;
-
- 272 std::array<T, 2> x0_and_squares = computeX0AndSquares(has_head, panel, j);
-
- 274 auto tau = computeReflectorAndTau(has_head, panel, j, std::move(x0_and_squares));
-
-
-
-
- 279 template <
class MatrixLikeA,
class MatrixLikeTaus>
- 280 void computePanelReflectors(MatrixLikeA& mat_a, MatrixLikeTaus& mat_taus,
const SizeType j_sub,
- 281 const matrix::SubPanelView& panel_view) {
- 282 static Device constexpr D = MatrixLikeA::device;
- 283 using T =
typename MatrixLikeA::ElementType;
- 284 namespace ex = pika::execution::experimental;
- 285 namespace di = dlaf::internal;
-
- 287 std::vector<matrix::ReadWriteTileSender<T, D>> panel_tiles;
- 288 panel_tiles.reserve(
to_sizet(std::distance(panel_view.iteratorLocal().begin(),
- 289 panel_view.iteratorLocal().end())));
- 290 for (
const auto& i : panel_view.iteratorLocal()) {
- 291 const matrix::SubTileSpec& spec = panel_view(i);
- 292 panel_tiles.emplace_back(matrix::splitTile(mat_a.readwrite(i), spec));
-
-
- 295 const std::size_t nthreads = getReductionToBandPanelNWorkers();
-
- 297 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads),
- 298 std::vector<common::internal::vector<T>>{}),
- 299 mat_taus.readwrite(LocalTileIndex(j_sub, 0)),
- 300 ex::when_all_vector(std::move(panel_tiles))) |
- 301 ex::transfer(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
- 302 ex::bulk(nthreads, [nthreads, cols = panel_view.cols()](
const std::size_t index,
auto& barrier_ptr,
- 303 auto& w,
auto& taus,
auto& tiles) {
- 304 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
- 305 const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
- 306 const std::size_t begin = index * batch_size;
- 307 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
- 308 const SizeType nrefls = taus.size().rows();
-
-
-
-
-
- 314 for (SizeType j = 0; j < nrefls; ++j) {
-
-
- 317 taus({j, 0}) = computeReflector(tiles, j);
-
-
- 320 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-
- 323 const SizeType pt_cols = cols - (j + 1);
-
-
- 326 const bool has_head = (index == 0);
-
- 328 w[index] = common::internal::vector<T>(pt_cols, 0);
- 329 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
- 330 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-
-
- 334 dlaf::eigensolver::internal::reduceColumnVectors(w);
- 335 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-
- 338 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
- 339 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-
- 342 ex::start_detached(std::move(s));
-
-
- 345 template <Backend B, Device D,
class T>
- 346 void setupReflectorPanelV(
bool has_head,
const matrix::SubPanelView& panel_view,
const SizeType nrefls,
- 347 matrix::Panel<Coord::Col, T, D>& v, matrix::Matrix<const T, D>& mat_a,
- 348 bool force_copy =
false) {
- 349 namespace ex = pika::execution::experimental;
-
- 351 using pika::execution::thread_priority;
-
-
-
-
-
-
-
-
- 360 auto it_begin = panel_view.iteratorLocal().begin();
- 361 auto it_end = panel_view.iteratorLocal().end();
-
-
- 364 const LocalTileIndex i = *it_begin;
- 365 matrix::SubTileSpec spec = panel_view(i);
-
-
-
-
- 370 spec.size = {spec.size.rows(), std::min(nrefls, spec.size.cols())};
-
-
-
-
-
- 376 ex::start_detached(dlaf::internal::whenAllLift(
splitTile(mat_a.read(i), spec), v.readwrite(i)) |
-
- 378 ex::start_detached(dlaf::internal::whenAllLift(blas::Uplo::Upper, T(0), T(1), v.readwrite(i)) |
-
-
-
-
-
-
-
- 386 for (
auto it = it_begin; it < it_end; ++it) {
- 387 const LocalTileIndex idx = *it;
- 388 const matrix::SubTileSpec& spec = panel_view(idx);
-
-
-
-
-
-
-
-
- 397 ex::start_detached(ex::when_all(matrix::splitTile(mat_a.read(idx), spec), v.readwrite(idx)) |
-
-
- 400 v.setTile(idx, matrix::splitTile(mat_a.read(idx), spec));
-
-
-
- 404 template <Backend B, Device D,
class T>
- 405 void trmmComputeW(matrix::Panel<Coord::Col, T, D>& w, matrix::Panel<Coord::Col, T, D>& v,
- 406 matrix::ReadOnlyTileSender<T, D> tile_t) {
- 407 namespace ex = pika::execution::experimental;
-
- 409 using pika::execution::thread_priority;
- 410 using namespace blas;
-
- 412 auto it = w.iteratorLocal();
-
- 414 for (
const auto& index_i : it) {
- 415 ex::start_detached(dlaf::internal::whenAllLift(Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit,
- 416 T(1), tile_t, v.read(index_i), w.readwrite(index_i)) |
-
-
-
-
- 421 ex::start_detached(std::move(tile_t));
-
-
-
- 425 template <Backend B, Device D,
class T>
- 426 void gemmUpdateX(matrix::Panel<Coord::Col, T, D>& x, matrix::Matrix<const T, D>& w2,
- 427 matrix::Panel<Coord::Col, const T, D>& v) {
- 428 namespace ex = pika::execution::experimental;
-
- 430 using pika::execution::thread_priority;
- 431 using namespace blas;
-
-
- 434 for (
const auto& index_i : v.iteratorLocal())
- 435 ex::start_detached(dlaf::internal::whenAllLift(Op::NoTrans, Op::NoTrans, T(-0.5), v.read(index_i),
- 436 w2.read(LocalTileIndex(0, 0)), T(1),
- 437 x.readwrite(index_i)) |
-
-
-
- 441 template <Backend B, Device D,
class T>
- 442 void hemmComputeX(matrix::Panel<Coord::Col, T, D>& x,
const matrix::SubMatrixView& view,
- 443 matrix::Matrix<const T, D>& a, matrix::Panel<Coord::Col, const T, D>& w) {
- 444 namespace ex = pika::execution::experimental;
-
- 446 using pika::execution::thread_priority;
-
- 448 const auto dist = a.distribution();
-
-
-
-
-
- 454 matrix::util::set0<B>(thread_priority::high, x);
-
- 456 const LocalTileIndex at_offset = view.begin();
-
- 458 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
- 459 const auto limit = i + 1;
- 460 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
- 461 const LocalTileIndex ij{i, j};
-
- 463 const bool is_diagonal_tile = (ij.row() == ij.col());
-
- 465 const auto& tile_a =
splitTile(a.read(ij), view(ij));
-
- 467 if (is_diagonal_tile) {
- 468 hemmDiag<B>(thread_priority::high, tile_a, w.read(ij), x.readwrite(ij));
-
-
-
-
-
-
-
-
-
- 478 const LocalTileIndex index_x(Coord::Row, ij.row());
- 479 const LocalTileIndex index_w(Coord::Row, ij.col());
- 480 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, w.read(index_w),
- 481 x.readwrite(index_x));
-
-
-
- 485 const LocalTileIndex index_pretended =
transposed(ij);
- 486 const LocalTileIndex index_x(Coord::Row, index_pretended.row());
- 487 const LocalTileIndex index_w(Coord::Row, index_pretended.col());
- 488 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, tile_a, w.read(index_w),
- 489 x.readwrite(index_x));
-
-
-
-
-
-
- 496 template <Backend B, Device D,
class T>
- 497 void gemmComputeW2(matrix::Matrix<T, D>& w2, matrix::Panel<Coord::Col, const T, D>& w,
- 498 matrix::Panel<Coord::Col, const T, D>& x) {
- 499 using pika::execution::thread_priority;
-
- 501 namespace ex = pika::execution::experimental;
-
-
-
-
-
- 507 ex::start_detached(w2.readwrite(LocalTileIndex(0, 0)) |
-
-
- 510 using namespace blas;
-
- 512 for (
const auto& index_tile : w.iteratorLocal())
- 513 ex::start_detached(dlaf::internal::whenAllLift(Op::ConjTrans, Op::NoTrans, T(1), w.read(index_tile),
- 514 x.read(index_tile), T(1),
- 515 w2.readwrite(LocalTileIndex(0, 0))) |
-
-
-
- 519 template <Backend B, Device D,
class T>
- 520 void her2kUpdateTrailingMatrix(
const matrix::SubMatrixView& view, matrix::Matrix<T, D>& a,
- 521 matrix::Panel<Coord::Col, const T, D>& x,
- 522 matrix::Panel<Coord::Col, const T, D>& v) {
- 523 static_assert(std::is_signed_v<BaseType<T>>,
"alpha in computations requires to be -1");
-
- 525 using pika::execution::thread_priority;
-
- 527 const auto dist = a.distribution();
-
- 529 const LocalTileIndex at_start = view.begin();
-
- 531 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
- 532 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
- 533 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
- 534 for (SizeType j = at_start.col(); j < limit; ++j) {
- 535 const LocalTileIndex ij_local{i, j};
- 536 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
-
- 538 const bool is_diagonal_tile = (ij.row() == ij.col());
-
- 540 auto getSubA = [&a, &view, ij_local]() {
- 541 return splitTile(a.readwrite(ij_local), view(ij_local));
-
-
-
-
- 546 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
-
- 548 if (is_diagonal_tile) {
- 549 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
-
-
-
- 553 her2kOffDiag<B>(priority, x.read(ij_local), v.read(
transposed(ij_local)), getSubA());
-
-
- 556 her2kOffDiag<B>(priority, v.read(ij_local), x.read(
transposed(ij_local)), getSubA());
-
-
-
-
-
-
-
- 564 namespace distributed {
- 565 template <Device D,
class T>
- 566 T computeReflector(
const bool has_head, comm::Communicator& communicator,
- 567 const std::vector<matrix::Tile<T, D>>& panel, SizeType j) {
- 568 std::array<T, 2> x0_and_squares = computeX0AndSquares(has_head, panel, j);
-
-
-
-
-
-
-
-
-
-
-
-
-
- 582 comm::sync::allReduceInPlace(communicator, MPI_SUM,
- 583 common::make_data(x0_and_squares.data(),
-
-
- 586 auto tau = computeReflectorAndTau(has_head, panel, j, std::move(x0_and_squares));
-
-
-
-
- 591 template <
class MatrixLikeA,
class MatrixLikeTaus,
class TriggerSender,
class CommSender>
- 592 void computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0,
- 593 CommSender&& mpi_col_chain_panel, MatrixLikeA& mat_a,
- 594 MatrixLikeTaus& mat_taus, SizeType j_sub,
- 595 const matrix::SubPanelView& panel_view) {
- 596 static Device constexpr D = MatrixLikeA::device;
- 597 using T =
typename MatrixLikeA::ElementType;
- 598 namespace ex = pika::execution::experimental;
- 599 namespace di = dlaf::internal;
-
- 601 std::vector<matrix::ReadWriteTileSender<T, D>> panel_tiles;
- 602 panel_tiles.reserve(
to_sizet(std::distance(panel_view.iteratorLocal().begin(),
- 603 panel_view.iteratorLocal().end())));
- 604 for (
const auto& i : panel_view.iteratorLocal()) {
- 605 const matrix::SubTileSpec& spec = panel_view(i);
- 606 panel_tiles.emplace_back(matrix::splitTile(mat_a.readwrite(i), spec));
-
-
- 609 const std::size_t nthreads = getReductionToBandPanelNWorkers();
-
- 611 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads),
- 612 std::vector<common::internal::vector<T>>{}),
- 613 mat_taus.readwrite(GlobalTileIndex(j_sub, 0)),
- 614 ex::when_all_vector(std::move(panel_tiles)),
- 615 std::forward<CommSender>(mpi_col_chain_panel), std::forward<TriggerSender>(trigger)) |
- 616 ex::transfer(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
- 617 ex::bulk(nthreads, [nthreads, rank_v0,
- 618 cols = panel_view.cols()](
const std::size_t index,
auto& barrier_ptr,
auto& w,
- 619 auto& taus,
auto& tiles,
auto&& pcomm) {
- 620 const bool rankHasHead = rank_v0 == pcomm.get().rank();
-
- 622 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
- 623 const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
- 624 const std::size_t begin = index * batch_size;
- 625 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
- 626 const SizeType nrefls = taus.size().rows();
-
-
-
-
-
- 632 for (SizeType j = 0; j < nrefls; ++j) {
-
-
- 635 const bool has_head = rankHasHead;
- 636 taus({j, 0}) = computeReflector(has_head, pcomm.get(), tiles, j);
-
- 638 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-
- 641 const SizeType pt_cols = cols - (j + 1);
-
-
-
- 645 const bool has_head = rankHasHead && (index == 0);
-
- 647 w[index] = common::internal::vector<T>(pt_cols, 0);
- 648 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
- 649 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-
-
- 653 dlaf::eigensolver::internal::reduceColumnVectors(w);
- 654 comm::sync::allReduceInPlace(pcomm.get(), MPI_SUM, common::make_data(w[0].data(), pt_cols));
-
- 656 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-
- 659 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
- 660 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-
- 663 ex::start_detached(std::move(s));
-
-
- 666 template <Backend B, Device D,
class T>
- 667 void hemmComputeX(comm::IndexT_MPI reducer_col, matrix::Panel<Coord::Col, T, D>& x,
- 668 matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& xt,
- 669 const matrix::SubMatrixView& view, matrix::Matrix<const T, D>& a,
- 670 matrix::Panel<Coord::Col, const T, D>& w,
- 671 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& wt,
- 672 common::Pipeline<comm::Communicator>& mpi_row_chain,
- 673 common::Pipeline<comm::Communicator>& mpi_col_chain) {
- 674 namespace ex = pika::execution::experimental;
-
- 676 using pika::execution::thread_priority;
-
- 678 const auto dist = a.distribution();
- 679 const auto rank = dist.rankIndex();
-
-
-
-
-
- 685 matrix::util::set0<B>(thread_priority::high, x);
- 686 matrix::util::set0<B>(thread_priority::high, xt);
-
- 688 const LocalTileIndex at_offset = view.begin();
-
- 690 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
- 691 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
- 692 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
- 693 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
- 694 const LocalTileIndex ij_local{i, j};
- 695 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
-
- 697 const bool is_diagonal_tile = (ij.row() == ij.col());
-
- 699 auto tile_a =
splitTile(a.read(ij), view(ij_local));
-
- 701 if (is_diagonal_tile) {
- 702 hemmDiag<B>(thread_priority::high, std::move(tile_a), w.read(ij_local), x.readwrite(ij_local));
-
-
-
-
-
-
-
-
- 711 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, wt.read(ij_local),
- 712 x.readwrite(ij_local));
-
-
-
-
-
-
-
-
-
- 722 const auto owner = dist.template rankGlobalTile<Coord::Row>(ij.col());
-
- 724 const LocalTileIndex index_x{dist.template localTileFromGlobalTile<Coord::Row>(ij.col()), 0};
- 725 const LocalTileIndex index_xt{0, ij_local.col()};
-
- 727 auto tile_x = (dist.rankIndex().row() == owner) ? x.readwrite(index_x) : xt.readwrite(index_xt);
-
- 729 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, std::move(tile_a), w.read(ij_local),
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 745 for (
const auto& index_xt : xt.iteratorLocal()) {
- 746 const auto index_k = dist.template globalTileFromLocalTile<Coord::Col>(index_xt.col());
- 747 const auto rank_owner_row = dist.template rankGlobalTile<Coord::Row>(index_k);
-
- 749 if (rank_owner_row == rank.row()) {
-
-
-
-
-
-
- 756 const auto i = dist.template localTileFromGlobalTile<Coord::Row>(index_k);
- 757 ex::start_detached(comm::scheduleReduceRecvInPlace(mpi_col_chain(), MPI_SUM, x.readwrite({i, 0})));
-
-
- 760 ex::start_detached(comm::scheduleReduceSend(mpi_col_chain(), rank_owner_row, MPI_SUM,
-
-
-
-
-
-
-
-
- 769 for (
const auto& index_x : x.iteratorLocal()) {
- 770 if (reducer_col == rank.col())
- 771 ex::start_detached(comm::scheduleReduceRecvInPlace(mpi_row_chain(), MPI_SUM,
- 772 x.readwrite(index_x)));
-
- 774 ex::start_detached(comm::scheduleReduceSend(mpi_row_chain(), reducer_col, MPI_SUM,
-
-
-
-
- 779 template <Backend B, Device D,
class T>
- 780 void her2kUpdateTrailingMatrix(
const matrix::SubMatrixView& view, Matrix<T, D>& a,
- 781 matrix::Panel<Coord::Col, const T, D>& x,
- 782 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& vt,
- 783 matrix::Panel<Coord::Col, const T, D>& v,
- 784 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& xt) {
- 785 static_assert(std::is_signed_v<BaseType<T>>,
"alpha in computations requires to be -1");
-
- 787 using pika::execution::thread_priority;
-
- 789 const auto dist = a.distribution();
-
- 791 const LocalTileIndex at_start = view.begin();
-
- 793 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
- 794 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
- 795 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
- 796 for (SizeType j = at_start.col(); j < limit; ++j) {
- 797 const LocalTileIndex ij_local{i, j};
- 798 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
-
- 800 const bool is_diagonal_tile = (ij.row() == ij.col());
-
- 802 auto getSubA = [&a, &view, ij_local]() {
- 803 return splitTile(a.readwrite(ij_local), view(ij_local));
-
-
-
-
- 808 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
-
- 810 if (is_diagonal_tile) {
- 811 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
-
-
-
- 815 her2kOffDiag<B>(priority, x.read(ij_local), vt.read(ij_local), getSubA());
-
-
- 818 her2kOffDiag<B>(priority, v.read(ij_local), xt.read(ij_local), getSubA());
-
-
-
+
+
+
+ 48 #include <dlaf/sender/traits.h>
+
+
+
+
+ 53 namespace dlaf::eigensolver::internal {
+
+
+
+ 57 void reduceColumnVectors(std::vector<common::internal::vector<T>>& columnVectors) {
+ 58 for (std::size_t i = 1; i < columnVectors.size(); ++i) {
+ 59 DLAF_ASSERT_HEAVY(columnVectors[0].size() == columnVectors[i].size(), columnVectors[0].size(),
+ 60 columnVectors[i].size());
+ 61 for (SizeType j = 0; j < columnVectors[0].size(); ++j)
+ 62 columnVectors[0][j] += columnVectors[i][j];
+
+
+
+
+
+
+ 69 template <Device D,
class T>
+ 70 std::array<T, 2> computeX0AndSquares(
const bool has_head,
const std::vector<matrix::Tile<T, D>>& panel,
+
+ 72 std::array<T, 2> x0_and_squares{0, 0};
+ 73 auto it_begin = panel.begin();
+ 74 auto it_end = panel.end();
+
+ 76 common::internal::SingleThreadedBlasScope single;
+
+
+ 79 auto& tile_v0 = *it_begin++;
+
+ 81 const TileElementIndex idx_x0(j, j);
+ 82 x0_and_squares[0] = tile_v0(idx_x0);
+
+ 84 T* reflector_ptr = tile_v0.ptr({idx_x0});
+
+ 86 blas::dot(tile_v0.size().rows() - idx_x0.row(), reflector_ptr, 1, reflector_ptr, 1);
+
+
+ 89 for (
auto it = it_begin; it != it_end; ++it) {
+ 90 const auto& tile = *it;
+
+ 92 T* reflector_ptr = tile.ptr({0, j});
+ 93 x0_and_squares[1] += blas::dot(tile.size().rows(), reflector_ptr, 1, reflector_ptr, 1);
+
+ 95 return x0_and_squares;
+
+
+ 98 template <Device D,
class T>
+ 99 T computeReflectorAndTau(
const bool has_head,
const std::vector<matrix::Tile<T, D>>& panel,
+ 100 const SizeType j, std::array<T, 2> x0_and_squares) {
+ 101 const T
norm = std::sqrt(x0_and_squares[1]);
+ 102 const T x0 = x0_and_squares[0];
+ 103 const T y = std::signbit(std::real(x0_and_squares[0])) ?
norm : -
norm;
+ 104 const T tau = (y - x0) / y;
+
+ 106 auto it_begin = panel.begin();
+ 107 auto it_end = panel.end();
+
+ 109 common::internal::SingleThreadedBlasScope single;
+
+
+ 112 const auto& tile_v0 = *it_begin++;
+
+ 114 const TileElementIndex idx_x0(j, j);
+
+
+ 117 if (j + 1 < tile_v0.size().rows()) {
+ 118 T* v = tile_v0.ptr({j + 1, j});
+ 119 blas::scal(tile_v0.size().rows() - (j + 1), T(1) / (x0 - y), v, 1);
+
+
+
+ 123 for (
auto it = it_begin; it != it_end; ++it) {
+
+ 125 T* v = tile_v.ptr({0, j});
+ 126 blas::scal(tile_v.size().rows(), T(1) / (x0 - y), v, 1);
+
+
+
+
+
+ 132 template <Device D,
class T>
+ 133 void computeWTrailingPanel(
const bool has_head,
const std::vector<matrix::Tile<T, D>>& panel,
+ 134 common::internal::vector<T>& w, SizeType j,
const SizeType pt_cols,
+ 135 const std::size_t begin,
const std::size_t end) {
+
+
+
+
+
+ 141 const TileElementIndex index_el_x0(j, j);
+ 142 bool has_first_component = has_head;
+
+ 144 common::internal::SingleThreadedBlasScope single;
+
+
+ 147 for (
auto index = begin; index < end; ++index) {
+ 148 const matrix::Tile<const T, D>& tile_a = panel[index];
+ 149 const SizeType first_element = has_first_component ? index_el_x0.row() : 0;
+
+ 151 TileElementIndex pt_start{first_element, index_el_x0.col() + 1};
+ 152 TileElementSize pt_size{tile_a.size().rows() - pt_start.row(), pt_cols};
+ 153 TileElementIndex v_start{first_element, index_el_x0.col()};
+
+ 155 if (has_first_component) {
+ 156 const TileElementSize offset{1, 0};
+
+
+ 159 blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, offset.rows(), pt_size.cols(), T(1),
+ 160 tile_a.ptr(pt_start), tile_a.ld(), &fake_v, 1, T(0), w.data(), 1);
+
+ 162 pt_start = pt_start + offset;
+ 163 v_start = v_start + offset;
+ 164 pt_size = pt_size - offset;
+
+ 166 has_first_component =
false;
+
+
+ 169 if (pt_start.isIn(tile_a.size())) {
+
+ 171 blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, pt_size.rows(), pt_size.cols(), T(1),
+ 172 tile_a.ptr(pt_start), tile_a.ld(), tile_a.ptr(v_start), 1, T(1), w.data(), 1);
+
+
+
+
+ 177 template <Device D,
class T>
+ 178 void updateTrailingPanel(
const bool has_head,
const std::vector<matrix::Tile<T, D>>& panel, SizeType j,
+ 179 const std::vector<T>& w,
const T tau,
const std::size_t begin,
+ 180 const std::size_t end) {
+ 181 const TileElementIndex index_el_x0(j, j);
+
+ 183 bool has_first_component = has_head;
+
+ 185 common::internal::SingleThreadedBlasScope single;
+
+
+ 188 for (
auto index = begin; index < end; ++index) {
+ 189 const matrix::Tile<T, D>& tile_a = panel[index];
+ 190 const SizeType first_element = has_first_component ? index_el_x0.row() : 0;
+
+ 192 TileElementIndex pt_start{first_element, index_el_x0.col() + 1};
+ 193 TileElementSize pt_size{tile_a.size().rows() - pt_start.row(),
+ 194 tile_a.size().cols() - pt_start.col()};
+ 195 TileElementIndex v_start{first_element, index_el_x0.col()};
+
+ 197 if (has_first_component) {
+ 198 const TileElementSize offset{1, 0};
+
+
+
+ 202 blas::ger(blas::Layout::ColMajor, 1, pt_size.cols(), -dlaf::conj(tau), &fake_v, 1, w.data(), 1,
+ 203 tile_a.ptr(pt_start), tile_a.ld());
+
+ 205 pt_start = pt_start + offset;
+ 206 v_start = v_start + offset;
+ 207 pt_size = pt_size - offset;
+
+ 209 has_first_component =
false;
+
+
+ 212 if (pt_start.isIn(tile_a.size())) {
+
+ 214 blas::ger(blas::Layout::ColMajor, pt_size.rows(), pt_size.cols(), -dlaf::conj(tau),
+ 215 tile_a.ptr(v_start), 1, w.data(), 1, tile_a.ptr(pt_start), tile_a.ld());
+
+
+
+
+ 220 template <Backend B,
typename ASender,
typename WSender,
typename XSender>
+ 221 void hemmDiag(pika::execution::thread_priority priority, ASender&& tile_a, WSender&& tile_w,
+
+ 223 using T = dlaf::internal::SenderElementType<ASender>;
+ 224 pika::execution::experimental::start_detached(
+ 225 dlaf::internal::whenAllLift(blas::Side::Left, blas::Uplo::Lower, T(1),
+ 226 std::forward<ASender>(tile_a), std::forward<WSender>(tile_w), T(1),
+ 227 std::forward<XSender>(tile_x)) |
+
+
+
+
+ 232 template <Backend B,
typename ASender,
typename WSender,
typename XSender>
+ 233 void hemmOffDiag(pika::execution::thread_priority priority, blas::Op op, ASender&& tile_a,
+ 234 WSender&& tile_w, XSender&& tile_x) {
+ 235 using T = dlaf::internal::SenderElementType<ASender>;
+ 236 pika::execution::experimental::start_detached(
+ 237 dlaf::internal::whenAllLift(op, blas::Op::NoTrans, T(1), std::forward<ASender>(tile_a),
+ 238 std::forward<WSender>(tile_w), T(1), std::forward<XSender>(tile_x)) |
+
+
+
+ 242 template <Backend B,
typename VSender,
typename XSender,
typename ASender>
+ 243 void her2kDiag(pika::execution::thread_priority priority, VSender&& tile_v, XSender&& tile_x,
+
+ 245 using T = dlaf::internal::SenderElementType<VSender>;
+ 246 pika::execution::experimental::start_detached(
+ 247 dlaf::internal::whenAllLift(blas::Uplo::Lower, blas::Op::NoTrans, T(-1),
+ 248 std::forward<VSender>(tile_v), std::forward<XSender>(tile_x),
+ 249 BaseType<T>(1), std::forward<ASender>(tile_a)) |
+
+
+
+
+ 254 template <Backend B,
typename ASender,
typename BSender,
typename CSender>
+ 255 void her2kOffDiag(pika::execution::thread_priority priority, ASender&& tile_a, BSender&& tile_b,
+
+ 257 using T = dlaf::internal::SenderElementType<ASender>;
+ 258 pika::execution::experimental::start_detached(
+ 259 dlaf::internal::whenAllLift(blas::Op::NoTrans, blas::Op::ConjTrans, T(-1),
+ 260 std::forward<ASender>(tile_a), std::forward<BSender>(tile_b), T(1),
+ 261 std::forward<CSender>(tile_c)) |
+
+
+
+
+
+ 267 template <Device D,
class T>
+ 268 T computeReflector(
const std::vector<matrix::Tile<T, D>>& panel, SizeType j) {
+ 269 constexpr
bool has_head =
true;
+
+ 271 std::array<T, 2> x0_and_squares = computeX0AndSquares(has_head, panel, j);
+
+ 273 auto tau = computeReflectorAndTau(has_head, panel, j, std::move(x0_and_squares));
+
+
+
+
+ 278 template <
class MatrixLikeA,
class MatrixLikeTaus>
+ 279 void computePanelReflectors(MatrixLikeA& mat_a, MatrixLikeTaus& mat_taus,
const SizeType j_sub,
+ 280 const matrix::SubPanelView& panel_view) {
+ 281 static Device constexpr D = MatrixLikeA::device;
+ 282 using T =
typename MatrixLikeA::ElementType;
+ 283 namespace ex = pika::execution::experimental;
+ 284 namespace di = dlaf::internal;
+
+ 286 std::vector<matrix::ReadWriteTileSender<T, D>> panel_tiles;
+ 287 panel_tiles.reserve(
to_sizet(std::distance(panel_view.iteratorLocal().begin(),
+ 288 panel_view.iteratorLocal().end())));
+ 289 for (
const auto& i : panel_view.iteratorLocal()) {
+ 290 const matrix::SubTileSpec& spec = panel_view(i);
+ 291 panel_tiles.emplace_back(matrix::splitTile(mat_a.readwrite(i), spec));
+
+
+ 294 const std::size_t nthreads = getReductionToBandPanelNWorkers();
+
+ 296 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads),
+ 297 std::vector<common::internal::vector<T>>{}),
+ 298 mat_taus.readwrite(LocalTileIndex(j_sub, 0)),
+ 299 ex::when_all_vector(std::move(panel_tiles))) |
+ 300 ex::transfer(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
+ 301 ex::bulk(nthreads, [nthreads, cols = panel_view.cols()](
const std::size_t index,
auto& barrier_ptr,
+ 302 auto& w,
auto& taus,
auto& tiles) {
+ 303 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
+ 304 const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
+ 305 const std::size_t begin = index * batch_size;
+ 306 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
+ 307 const SizeType nrefls = taus.size().rows();
+
+
+
+
+
+ 313 for (SizeType j = 0; j < nrefls; ++j) {
+
+
+ 316 taus({j, 0}) = computeReflector(tiles, j);
+
+
+ 319 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+
+ 322 const SizeType pt_cols = cols - (j + 1);
+
+
+ 325 const bool has_head = (index == 0);
+
+ 327 w[index] = common::internal::vector<T>(pt_cols, 0);
+ 328 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
+ 329 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+
+
+ 333 dlaf::eigensolver::internal::reduceColumnVectors(w);
+ 334 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+
+ 337 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
+ 338 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+
+ 341 ex::start_detached(std::move(s));
+
+
+ 344 template <Backend B, Device D,
class T>
+ 345 void setupReflectorPanelV(
bool has_head,
const matrix::SubPanelView& panel_view,
const SizeType nrefls,
+ 346 matrix::Panel<Coord::Col, T, D>& v, matrix::Matrix<const T, D>& mat_a,
+ 347 bool force_copy =
false) {
+ 348 namespace ex = pika::execution::experimental;
+
+ 350 using pika::execution::thread_priority;
+
+
+
+
+
+
+
+
+ 359 auto it_begin = panel_view.iteratorLocal().begin();
+ 360 auto it_end = panel_view.iteratorLocal().end();
+
+
+ 363 const LocalTileIndex i = *it_begin;
+ 364 matrix::SubTileSpec spec = panel_view(i);
+
+
+
+
+ 369 spec.size = {spec.size.rows(), std::min(nrefls, spec.size.cols())};
+
+
+
+
+
+ 375 ex::start_detached(dlaf::internal::whenAllLift(
splitTile(mat_a.read(i), spec), v.readwrite(i)) |
+
+ 377 ex::start_detached(dlaf::internal::whenAllLift(blas::Uplo::Upper, T(0), T(1), v.readwrite(i)) |
+
+
+
+
+
+
+
+ 385 for (
auto it = it_begin; it < it_end; ++it) {
+ 386 const LocalTileIndex idx = *it;
+ 387 const matrix::SubTileSpec& spec = panel_view(idx);
+
+
+
+
+
+
+
+
+ 396 ex::start_detached(ex::when_all(matrix::splitTile(mat_a.read(idx), spec), v.readwrite(idx)) |
+
+
+ 399 v.setTile(idx, matrix::splitTile(mat_a.read(idx), spec));
+
+
+
+ 403 template <Backend B, Device D,
class T>
+ 404 void trmmComputeW(matrix::Panel<Coord::Col, T, D>& w, matrix::Panel<Coord::Col, T, D>& v,
+ 405 matrix::ReadOnlyTileSender<T, D> tile_t) {
+ 406 namespace ex = pika::execution::experimental;
+
+ 408 using pika::execution::thread_priority;
+ 409 using namespace blas;
+
+ 411 auto it = w.iteratorLocal();
+
+ 413 for (
const auto& index_i : it) {
+ 414 ex::start_detached(dlaf::internal::whenAllLift(Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit,
+ 415 T(1), tile_t, v.read(index_i), w.readwrite(index_i)) |
+
+
+
+
+ 420 ex::start_detached(std::move(tile_t));
+
+
+
+ 424 template <Backend B, Device D,
class T>
+ 425 void gemmUpdateX(matrix::Panel<Coord::Col, T, D>& x, matrix::Matrix<const T, D>& w2,
+ 426 matrix::Panel<Coord::Col, const T, D>& v) {
+ 427 namespace ex = pika::execution::experimental;
+
+ 429 using pika::execution::thread_priority;
+ 430 using namespace blas;
+
+
+ 433 for (
const auto& index_i : v.iteratorLocal())
+ 434 ex::start_detached(dlaf::internal::whenAllLift(Op::NoTrans, Op::NoTrans, T(-0.5), v.read(index_i),
+ 435 w2.read(LocalTileIndex(0, 0)), T(1),
+ 436 x.readwrite(index_i)) |
+
+
+
+ 440 template <Backend B, Device D,
class T>
+ 441 void hemmComputeX(matrix::Panel<Coord::Col, T, D>& x,
const matrix::SubMatrixView& view,
+ 442 matrix::Matrix<const T, D>& a, matrix::Panel<Coord::Col, const T, D>& w) {
+ 443 namespace ex = pika::execution::experimental;
+
+ 445 using pika::execution::thread_priority;
+
+ 447 const auto dist = a.distribution();
+
+
+
+
+
+ 453 matrix::util::set0<B>(thread_priority::high, x);
+
+ 455 const LocalTileIndex at_offset = view.begin();
+
+ 457 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
+ 458 const auto limit = i + 1;
+ 459 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
+ 460 const LocalTileIndex ij{i, j};
+
+ 462 const bool is_diagonal_tile = (ij.row() == ij.col());
+
+ 464 const auto& tile_a =
splitTile(a.read(ij), view(ij));
+
+ 466 if (is_diagonal_tile) {
+ 467 hemmDiag<B>(thread_priority::high, tile_a, w.read(ij), x.readwrite(ij));
+
+
+
+
+
+
+
+
+
+ 477 const LocalTileIndex index_x(Coord::Row, ij.row());
+ 478 const LocalTileIndex index_w(Coord::Row, ij.col());
+ 479 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, w.read(index_w),
+ 480 x.readwrite(index_x));
+
+
+
+ 484 const LocalTileIndex index_pretended =
transposed(ij);
+ 485 const LocalTileIndex index_x(Coord::Row, index_pretended.row());
+ 486 const LocalTileIndex index_w(Coord::Row, index_pretended.col());
+ 487 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, tile_a, w.read(index_w),
+ 488 x.readwrite(index_x));
+
+
+
+
+
+
+ 495 template <Backend B, Device D,
class T>
+ 496 void gemmComputeW2(matrix::Matrix<T, D>& w2, matrix::Panel<Coord::Col, const T, D>& w,
+ 497 matrix::Panel<Coord::Col, const T, D>& x) {
+ 498 using pika::execution::thread_priority;
+
+ 500 namespace ex = pika::execution::experimental;
+
+
+
+
+
+ 506 ex::start_detached(w2.readwrite(LocalTileIndex(0, 0)) |
+
+
+ 509 using namespace blas;
+
+ 511 for (
const auto& index_tile : w.iteratorLocal())
+ 512 ex::start_detached(dlaf::internal::whenAllLift(Op::ConjTrans, Op::NoTrans, T(1), w.read(index_tile),
+ 513 x.read(index_tile), T(1),
+ 514 w2.readwrite(LocalTileIndex(0, 0))) |
+
+
+
+ 518 template <Backend B, Device D,
class T>
+ 519 void her2kUpdateTrailingMatrix(
const matrix::SubMatrixView& view, matrix::Matrix<T, D>& a,
+ 520 matrix::Panel<Coord::Col, const T, D>& x,
+ 521 matrix::Panel<Coord::Col, const T, D>& v) {
+ 522 static_assert(std::is_signed_v<BaseType<T>>,
"alpha in computations requires to be -1");
+
+ 524 using pika::execution::thread_priority;
+
+ 526 const auto dist = a.distribution();
+
+ 528 const LocalTileIndex at_start = view.begin();
+
+ 530 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
+ 531 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
+ 532 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
+ 533 for (SizeType j = at_start.col(); j < limit; ++j) {
+ 534 const LocalTileIndex ij_local{i, j};
+ 535 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
+
+ 537 const bool is_diagonal_tile = (ij.row() == ij.col());
+
+ 539 auto getSubA = [&a, &view, ij_local]() {
+ 540 return splitTile(a.readwrite(ij_local), view(ij_local));
+
+
+
+
+ 545 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
+
+ 547 if (is_diagonal_tile) {
+ 548 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
+
+
+
+ 552 her2kOffDiag<B>(priority, x.read(ij_local), v.read(
transposed(ij_local)), getSubA());
+
+
+ 555 her2kOffDiag<B>(priority, v.read(ij_local), x.read(
transposed(ij_local)), getSubA());
+
+
+
+
+
+
+
+ 563 namespace distributed {
+ 564 template <Device D,
class T>
+ 565 T computeReflector(
const bool has_head, comm::Communicator& communicator,
+ 566 const std::vector<matrix::Tile<T, D>>& panel, SizeType j) {
+ 567 std::array<T, 2> x0_and_squares = computeX0AndSquares(has_head, panel, j);
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 581 comm::sync::allReduceInPlace(communicator, MPI_SUM,
+ 582 common::make_data(x0_and_squares.data(),
+
+
+ 585 auto tau = computeReflectorAndTau(has_head, panel, j, std::move(x0_and_squares));
+
+
+
+
+ 590 template <
class MatrixLikeA,
class MatrixLikeTaus,
class TriggerSender,
class CommSender>
+ 591 void computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0,
+ 592 CommSender&& mpi_col_chain_panel, MatrixLikeA& mat_a,
+ 593 MatrixLikeTaus& mat_taus, SizeType j_sub,
+ 594 const matrix::SubPanelView& panel_view) {
+ 595 static Device constexpr D = MatrixLikeA::device;
+ 596 using T =
typename MatrixLikeA::ElementType;
+ 597 namespace ex = pika::execution::experimental;
+ 598 namespace di = dlaf::internal;
+
+ 600 std::vector<matrix::ReadWriteTileSender<T, D>> panel_tiles;
+ 601 panel_tiles.reserve(
to_sizet(std::distance(panel_view.iteratorLocal().begin(),
+ 602 panel_view.iteratorLocal().end())));
+ 603 for (
const auto& i : panel_view.iteratorLocal()) {
+ 604 const matrix::SubTileSpec& spec = panel_view(i);
+ 605 panel_tiles.emplace_back(matrix::splitTile(mat_a.readwrite(i), spec));
+
+
+ 608 const std::size_t nthreads = getReductionToBandPanelNWorkers();
+
+ 610 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads),
+ 611 std::vector<common::internal::vector<T>>{}),
+ 612 mat_taus.readwrite(GlobalTileIndex(j_sub, 0)),
+ 613 ex::when_all_vector(std::move(panel_tiles)),
+ 614 std::forward<CommSender>(mpi_col_chain_panel), std::forward<TriggerSender>(trigger)) |
+ 615 ex::transfer(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
+ 616 ex::bulk(nthreads, [nthreads, rank_v0,
+ 617 cols = panel_view.cols()](
const std::size_t index,
auto& barrier_ptr,
auto& w,
+ 618 auto& taus,
auto& tiles,
auto&& pcomm) {
+ 619 const bool rankHasHead = rank_v0 == pcomm.get().rank();
+
+ 621 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
+ 622 const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
+ 623 const std::size_t begin = index * batch_size;
+ 624 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
+ 625 const SizeType nrefls = taus.size().rows();
+
+
+
+
+
+ 631 for (SizeType j = 0; j < nrefls; ++j) {
+
+
+ 634 const bool has_head = rankHasHead;
+ 635 taus({j, 0}) = computeReflector(has_head, pcomm.get(), tiles, j);
+
+ 637 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+
+ 640 const SizeType pt_cols = cols - (j + 1);
+
+
+
+ 644 const bool has_head = rankHasHead && (index == 0);
+
+ 646 w[index] = common::internal::vector<T>(pt_cols, 0);
+ 647 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
+ 648 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+
+
+ 652 dlaf::eigensolver::internal::reduceColumnVectors(w);
+ 653 comm::sync::allReduceInPlace(pcomm.get(), MPI_SUM, common::make_data(w[0].data(), pt_cols));
+
+ 655 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+
+ 658 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
+ 659 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+
+ 662 ex::start_detached(std::move(s));
+
+
+ 665 template <Backend B, Device D,
class T>
+ 666 void hemmComputeX(comm::IndexT_MPI reducer_col, matrix::Panel<Coord::Col, T, D>& x,
+ 667 matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& xt,
+ 668 const matrix::SubMatrixView& view, matrix::Matrix<const T, D>& a,
+ 669 matrix::Panel<Coord::Col, const T, D>& w,
+ 670 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& wt,
+ 671 common::Pipeline<comm::Communicator>& mpi_row_chain,
+ 672 common::Pipeline<comm::Communicator>& mpi_col_chain) {
+ 673 namespace ex = pika::execution::experimental;
+
+ 675 using pika::execution::thread_priority;
+
+ 677 const auto dist = a.distribution();
+ 678 const auto rank = dist.rankIndex();
+
+
+
+
+
+ 684 matrix::util::set0<B>(thread_priority::high, x);
+ 685 matrix::util::set0<B>(thread_priority::high, xt);
+
+ 687 const LocalTileIndex at_offset = view.begin();
+
+ 689 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
+ 690 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
+ 691 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
+ 692 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
+ 693 const LocalTileIndex ij_local{i, j};
+ 694 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
+
+ 696 const bool is_diagonal_tile = (ij.row() == ij.col());
+
+ 698 auto tile_a =
splitTile(a.read(ij), view(ij_local));
+
+ 700 if (is_diagonal_tile) {
+ 701 hemmDiag<B>(thread_priority::high, std::move(tile_a), w.read(ij_local), x.readwrite(ij_local));
+
+
+
+
+
+
+
+
+ 710 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, wt.read(ij_local),
+ 711 x.readwrite(ij_local));
+
+
+
+
+
+
+
+
+
+ 721 const auto owner = dist.template rankGlobalTile<Coord::Row>(ij.col());
+
+ 723 const LocalTileIndex index_x{dist.template localTileFromGlobalTile<Coord::Row>(ij.col()), 0};
+ 724 const LocalTileIndex index_xt{0, ij_local.col()};
+
+ 726 auto tile_x = (dist.rankIndex().row() == owner) ? x.readwrite(index_x) : xt.readwrite(index_xt);
+
+ 728 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, std::move(tile_a), w.read(ij_local),
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 744 for (
const auto& index_xt : xt.iteratorLocal()) {
+ 745 const auto index_k = dist.template globalTileFromLocalTile<Coord::Col>(index_xt.col());
+ 746 const auto rank_owner_row = dist.template rankGlobalTile<Coord::Row>(index_k);
+
+ 748 if (rank_owner_row == rank.row()) {
+
+
+
+
+
+
+ 755 const auto i = dist.template localTileFromGlobalTile<Coord::Row>(index_k);
+ 756 ex::start_detached(comm::scheduleReduceRecvInPlace(mpi_col_chain(), MPI_SUM, x.readwrite({i, 0})));
+
+
+ 759 ex::start_detached(comm::scheduleReduceSend(mpi_col_chain(), rank_owner_row, MPI_SUM,
+
+
+
+
+
+
+
+
+ 768 for (
const auto& index_x : x.iteratorLocal()) {
+ 769 if (reducer_col == rank.col())
+ 770 ex::start_detached(comm::scheduleReduceRecvInPlace(mpi_row_chain(), MPI_SUM,
+ 771 x.readwrite(index_x)));
+
+ 773 ex::start_detached(comm::scheduleReduceSend(mpi_row_chain(), reducer_col, MPI_SUM,
+
+
+
+
+ 778 template <Backend B, Device D,
class T>
+ 779 void her2kUpdateTrailingMatrix(
const matrix::SubMatrixView& view, Matrix<T, D>& a,
+ 780 matrix::Panel<Coord::Col, const T, D>& x,
+ 781 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& vt,
+ 782 matrix::Panel<Coord::Col, const T, D>& v,
+ 783 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& xt) {
+ 784 static_assert(std::is_signed_v<BaseType<T>>,
"alpha in computations requires to be -1");
+
+ 786 using pika::execution::thread_priority;
+
+ 788 const auto dist = a.distribution();
+
+ 790 const LocalTileIndex at_start = view.begin();
+
+ 792 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
+ 793 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
+ 794 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
+ 795 for (SizeType j = at_start.col(); j < limit; ++j) {
+ 796 const LocalTileIndex ij_local{i, j};
+ 797 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
+
+ 799 const bool is_diagonal_tile = (ij.row() == ij.col());
+
+ 801 auto getSubA = [&a, &view, ij_local]() {
+ 802 return splitTile(a.readwrite(ij_local), view(ij_local));
+
+
+
+
+ 807 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
+
+ 809 if (is_diagonal_tile) {
+ 810 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
+
+
+
+ 814 her2kOffDiag<B>(priority, x.read(ij_local), vt.read(ij_local), getSubA());
+
+
+ 817 her2kOffDiag<B>(priority, v.read(ij_local), xt.read(ij_local), getSubA());
+
+
+
+
-
-
- 825 template <Backend B, Device D,
class T>
-
-
-
-
-
-
-
-
- 834 using red2band::local::computePanelReflectors;
- 835 computePanelReflectors(mat_a, mat_taus, j_sub, panel_view);
-
-
- 838 template <Device D,
class CommSender,
class TriggerSender>
- 839 void call(TriggerSender&& trigger,
comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
-
-
- 842 using red2band::distributed::computePanelReflectors;
- 843 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
- 844 std::forward<CommSender>(mpi_col_chain_panel), mat_a, mat_taus, j_sub,
-
-
-
-
-
-
-
-
- 853 : panels_v(n_workspaces, dist_a) {}
-
-
-
- 857 using red2band::local::computePanelReflectors;
-
- 859 namespace ex = pika::execution::experimental;
-
-
-
-
-
-
- 866 auto& v = panels_v.nextResource();
-
- 868 copyToCPU(panel_view, mat_a, v);
- 869 computePanelReflectors(v, mat_taus, j_sub, panel_view);
- 870 copyFromCPU(panel_view, v, mat_a);
-
-
- 873 template <Device D,
class CommSender,
class TriggerSender>
- 874 void call(TriggerSender&& trigger,
comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
-
-
- 877 auto& v = panels_v.nextResource();
-
-
- 880 copyToCPU(panel_view, mat_a, v);
-
-
- 883 using dlaf::eigensolver::internal::red2band::distributed::computePanelReflectors;
- 884 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
- 885 std::forward<CommSender>(mpi_col_chain_panel), v, mat_taus, j_sub,
-
-
-
- 889 copyFromCPU(panel_view, v, mat_a);
-
-
-
-
-
-
-
- 897 namespace ex = pika::execution::experimental;
-
-
- 900 using dlaf::matrix::internal::CopyBackend_v;
- 901 using pika::execution::thread_priority;
-
-
- 904 auto spec = panel_view(i);
-
-
- 907 ex::when_all(splitTile(mat_a.read(i), spec), splitTile(std::move(tmp_tile), spec)) |
- 908 matrix::copy(Policy<CopyBackend_v<Device::GPU, Device::CPU>>(thread_priority::high)));
-
-
-
-
-
- 914 namespace ex = pika::execution::experimental;
-
-
- 917 using dlaf::matrix::internal::CopyBackend_v;
- 918 using pika::execution::thread_priority;
-
-
- 921 auto spec = panel_view(i);
-
-
- 924 ex::when_all(splitTile(v.read(i), spec), splitTile(std::move(tile_a), spec)) |
- 925 matrix::copy(Policy<CopyBackend_v<Device::CPU, Device::GPU>>(thread_priority::high)));
-
-
-
-
-
-
-
-
- 934 template <Backend B, Device D,
class T>
-
-
-
-
- 939 using namespace red2band::local;
-
- 941 using common::iterate_range2d;
- 942 using factorization::internal::computeTFactor;
-
- 944 using pika::execution::experimental::any_sender;
-
- 946 const auto dist_a = mat_a.distribution();
-
- 948 {dist_a.blockSize().rows(), band_size});
-
-
-
- 952 const SizeType nrefls = std::max<SizeType>(0, dist_a.size().rows() - band_size - 1);
-
-
- 955 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
-
-
-
-
-
-
-
-
-
-
- 966 mat_taus,
LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
-
- 968 const SizeType ntiles = (nrefls - 1) / band_size + 1;
- 969 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
-
- 971 const bool is_full_band = (band_size == dist_a.blockSize().cols());
-
- 973 constexpr std::size_t n_workspaces = 2;
-
-
-
-
-
-
-
-
-
-
-
- 985 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist_a);
-
- 987 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
- 988 const auto i_sub = j_sub + 1;
-
-
-
- 992 const SizeType nrefls_tile = mat_taus_retiled.tileSize(
GlobalTileIndex(j_sub, 0)).rows();
-
- 994 const bool isPanelIncomplete = (nrefls_tile != band_size);
-
-
- 997 DLAF_ASSERT_HEAVY(nrefls_tile != 0, nrefls_tile);
-
-
-
-
-
- 1003 Panel<Coord::Col, T, D>& v = panels_v.nextResource();
- 1004 v.setRangeStart(ij_offset);
- 1005 if (isPanelIncomplete)
- 1006 v.setWidth(nrefls_tile);
-
-
- 1009 compute_panel_helper.call(mat_a, mat_taus_retiled, j_sub, panel_view);
-
-
-
-
-
- 1015 constexpr
bool has_reflector_head =
true;
- 1016 setupReflectorPanelV<B, D, T>(has_reflector_head, panel_view, nrefls_tile, v, mat_a, !is_full_band);
-
-
-
-
- 1021 Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
-
- 1023 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx));
-
-
- 1026 const GlobalElementIndex at_offset(ij_offset + GlobalElementSize(0, band_size));
-
-
- 1029 if (!at_offset.isIn(mat_a.size()))
-
-
- 1032 const matrix::SubMatrixView trailing_matrix_view(dist_a, at_offset);
-
-
- 1035 Panel<Coord::Col, T, D>& w = panels_w.nextResource();
- 1036 w.setRangeStart(at_offset);
- 1037 if (isPanelIncomplete)
- 1038 w.setWidth(nrefls_tile);
-
- 1040 trmmComputeW<B>(w, v, t.read(t_idx));
-
-
- 1043 Panel<Coord::Col, T, D>& x = panels_x.nextResource();
- 1044 x.setRangeStart(at_offset);
- 1045 if (isPanelIncomplete)
- 1046 x.setWidth(nrefls_tile);
-
-
-
-
-
- 1052 hemmComputeX<B>(x, trailing_matrix_view, mat_a, w);
-
-
-
-
-
-
-
- 1060 Matrix<T, D> w2 = std::move(t);
-
- 1062 gemmComputeW2<B>(w2, w, x);
- 1063 gemmUpdateX<B>(x, w2, v);
-
-
-
-
- 1068 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, v);
-
-
-
-
-
-
-
-
-
-
- 1079 template <Backend B, Device D,
class T>
- 1080 Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid grid, Matrix<T, D>& mat_a,
- 1081 const SizeType band_size) {
- 1082 using namespace red2band::distributed;
-
- 1084 using common::iterate_range2d;
- 1085 using factorization::internal::computeTFactor;
-
- 1087 namespace ex = pika::execution::experimental;
-
-
-
-
- 1092 pika::threads::get_thread_manager().wait();
-
- 1094 common::Pipeline<comm::Communicator> mpi_col_chain_panel(grid.colCommunicator().clone());
- 1095 common::Pipeline<comm::Communicator> mpi_row_chain(grid.rowCommunicator().clone());
- 1096 common::Pipeline<comm::Communicator> mpi_col_chain(grid.colCommunicator().clone());
-
- 1098 const auto& dist = mat_a.distribution();
- 1099 const comm::Index2D rank = dist.rankIndex();
-
-
-
- 1103 const SizeType nrefls = std::max<SizeType>(0, dist.size().rows() - band_size - 1);
-
-
- 1106 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
- 1107 Matrix<T, Device::CPU> mat_taus(matrix::Distribution(GlobalElementSize(nrefls, 1),
- 1108 TileElementSize(mat_a.blockSize().cols(), 1),
- 1109 comm::Size2D(mat_a.commGridSize().cols(), 1),
- 1110 comm::Index2D(mat_a.rankIndex().col(), 0),
- 1111 comm::Index2D(mat_a.sourceRankIndex().col(), 0)));
-
-
-
-
- 1116 matrix::RetiledMatrix<T, Device::CPU> mat_taus_retiled(
- 1117 mat_taus, LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
-
- 1119 const SizeType ntiles = (nrefls - 1) / band_size + 1;
- 1120 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
-
- 1122 const bool is_full_band = (band_size == dist.blockSize().cols());
-
- 1124 constexpr std::size_t n_workspaces = 2;
- 1125 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_v(n_workspaces, dist);
- 1126 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_vt(
- 1127 n_workspaces, dist);
-
- 1129 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_w(n_workspaces, dist);
- 1130 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_wt(
- 1131 n_workspaces, dist);
-
- 1133 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_x(n_workspaces, dist);
- 1134 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_xt(
- 1135 n_workspaces, dist);
-
- 1137 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist);
-
- 1139 ex::unique_any_sender<> trigger_panel{ex::just()};
- 1140 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
- 1141 const SizeType i_sub = j_sub + 1;
-
- 1143 const GlobalElementIndex ij_offset(i_sub * band_size, j_sub * band_size);
- 1144 const GlobalElementIndex at_offset(i_sub * band_size, (j_sub + 1) * band_size);
-
- 1146 const comm::Index2D rank_v0{
- 1147 dist.template rankGlobalElement<Coord::Row>(ij_offset.row()),
- 1148 dist.template rankGlobalElement<Coord::Col>(ij_offset.col()),
-
-
- 1151 const bool is_panel_rank_col = rank_v0.col() == rank.col();
-
- 1153 const SizeType nrefls_tile = mat_taus_retiled.tileSize(GlobalTileIndex(j_sub, 0)).rows();
-
- 1155 if (nrefls_tile == 0)
-
-
- 1158 auto& v = panels_v.nextResource();
- 1159 auto& vt = panels_vt.nextResource();
-
- 1161 v.setRangeStart(at_offset);
- 1162 vt.setRangeStart(at_offset);
-
- 1164 v.setWidth(nrefls_tile);
- 1165 vt.setHeight(nrefls_tile);
-
- 1167 const LocalTileIndex t_idx(0, 0);
-
-
- 1170 matrix::Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
-
-
- 1173 const matrix::SubPanelView panel_view(dist, ij_offset, band_size);
-
- 1175 if (is_panel_rank_col) {
- 1176 compute_panel_helper.call(std::move(trigger_panel), rank_v0.row(), mpi_col_chain_panel(), mat_a,
- 1177 mat_taus_retiled, j_sub, panel_view);
-
-
-
-
-
- 1183 red2band::local::setupReflectorPanelV<B, D, T>(rank.row() == rank_v0.row(), panel_view,
- 1184 nrefls_tile, v, mat_a, !is_full_band);
- 1185 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx),
-
-
-
-
-
-
- 1192 if (!at_offset.isIn(mat_a.size()))
-
-
- 1195 const matrix::SubMatrixView trailing_matrix_view(dist, at_offset);
-
- 1197 comm::broadcast(rank_v0.col(), v, vt, mpi_row_chain, mpi_col_chain);
-
-
- 1200 auto& w = panels_w.nextResource();
- 1201 auto& wt = panels_wt.nextResource();
-
- 1203 w.setRangeStart(at_offset);
- 1204 wt.setRangeStart(at_offset);
-
- 1206 w.setWidth(nrefls_tile);
- 1207 wt.setHeight(nrefls_tile);
-
- 1209 if (is_panel_rank_col)
- 1210 red2band::local::trmmComputeW<B, D>(w, v, t.read(t_idx));
-
- 1212 comm::broadcast(rank_v0.col(), w, wt, mpi_row_chain, mpi_col_chain);
-
-
- 1215 auto& x = panels_x.nextResource();
- 1216 auto& xt = panels_xt.nextResource();
-
- 1218 x.setRangeStart(at_offset);
- 1219 xt.setRangeStart(at_offset);
-
- 1221 x.setWidth(nrefls_tile);
- 1222 xt.setHeight(nrefls_tile);
-
-
-
-
-
-
-
-
-
- 1232 hemmComputeX<B, D>(rank_v0.col(), x, xt, trailing_matrix_view, mat_a, w, wt, mpi_row_chain,
-
-
-
-
-
-
-
-
-
- 1242 if (is_panel_rank_col) {
-
-
- 1245 matrix::Matrix<T, D> w2 = std::move(t);
-
- 1247 red2band::local::gemmComputeW2<B, D>(w2, w, x);
- 1248 ex::start_detached(comm::scheduleAllReduceInPlace(mpi_col_chain(), MPI_SUM,
- 1249 w2.readwrite(LocalTileIndex(0, 0))));
-
- 1251 red2band::local::gemmUpdateX<B, D>(x, w2, v);
-
-
-
-
-
-
- 1258 xt.setRangeStart(at_offset);
- 1259 xt.setHeight(nrefls_tile);
-
- 1261 comm::broadcast(rank_v0.col(), x, xt, mpi_row_chain, mpi_col_chain);
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 1305 const SizeType j_tile_current = ij_offset.col() / dist.blockSize().cols();
- 1306 const SizeType j_tile_next = at_offset.col() / dist.blockSize().cols();
- 1307 const bool isNextColumnOnSameRank = (j_tile_current == j_tile_next);
- 1308 const comm::IndexT_MPI rank_next_col =
- 1309 isNextColumnOnSameRank ? rank_v0.col() : (rank_v0.col() + 1) % dist.commGridSize().cols();
-
- 1311 if (rank.col() == rank_next_col) {
- 1312 const LocalTileIndex at{
- 1313 dist.template nextLocalTileFromGlobalElement<Coord::Row>(at_offset.row()),
- 1314 dist.template nextLocalTileFromGlobalElement<Coord::Col>(at_offset.col()),
-
-
-
-
-
-
-
-
- 1323 const SizeType at_tile_col =
- 1324 dist.template globalTileFromGlobalElement<Coord::Col>(at_offset.col());
-
- 1326 if (at_tile_col == dist.nrTiles().cols() - 1) {
- 1327 const comm::IndexT_MPI owner = rank_v0.row();
- 1328 if (rank.row() == owner) {
- 1329 xt.setTile(at, x.read(at));
-
- 1331 if (dist.commGridSize().rows() > 1)
- 1332 ex::start_detached(comm::scheduleSendBcast(mpi_col_chain(), xt.read(at)));
-
-
- 1335 if (dist.commGridSize().rows() > 1)
- 1336 ex::start_detached(comm::scheduleRecvBcast(mpi_col_chain(), owner, xt.readwrite(at)));
-
-
-
- 1340 if constexpr (dlaf::comm::CommunicationDevice_v<D> == D) {
-
-
-
- 1344 if (rank.row() == rank_v0.row()) {
- 1345 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
-
-
-
-
-
-
- 1352 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
-
-
-
- 1356 if (rank.row() == rank_v0.row()) {
-
-
-
-
-
-
-
-
- 1365 trigger_panel = x.readwrite(at) | ex::drop_value() | ex::ensure_started();
-
-
-
-
-
-
- 1372 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
-
-
-
-
-
- 1378 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, vt, v, xt);
-
-
-
-
-
-
-
-
-
-
+
+ 824 template <Backend B, Device D,
class T>
+
+
+
+
+
+
+ 831 void call(Matrix<T, Device::CPU>& mat_a, Matrix<T, Device::CPU>& mat_taus,
const SizeType j_sub,
+
+ 833 using red2band::local::computePanelReflectors;
+ 834 computePanelReflectors(mat_a, mat_taus, j_sub, panel_view);
+
+
+ 837 template <Device D,
class CommSender,
class TriggerSender>
+ 838 void call(TriggerSender&& trigger,
comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
+ 839 Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus,
const SizeType j_sub,
+
+ 841 using red2band::distributed::computePanelReflectors;
+ 842 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
+ 843 std::forward<CommSender>(mpi_col_chain_panel), mat_a, mat_taus, j_sub,
+
+
+
+
+
+
+
+
+ 852 : panels_v(n_workspaces, dist_a) {}
+
+ 854 void call(Matrix<T, Device::GPU>& mat_a, Matrix<T, Device::CPU>& mat_taus,
const SizeType j_sub,
+
+ 856 using red2band::local::computePanelReflectors;
+
+ 858 namespace ex = pika::execution::experimental;
+
+
+
+
+
+
+ 865 auto& v = panels_v.nextResource();
+
+ 867 copyToCPU(panel_view, mat_a, v);
+ 868 computePanelReflectors(v, mat_taus, j_sub, panel_view);
+ 869 copyFromCPU(panel_view, v, mat_a);
+
+
+ 872 template <Device D,
class CommSender,
class TriggerSender>
+ 873 void call(TriggerSender&& trigger,
comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
+ 874 Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, SizeType j_sub,
+
+ 876 auto& v = panels_v.nextResource();
+
+
+ 879 copyToCPU(panel_view, mat_a, v);
+
+
+ 882 using dlaf::eigensolver::internal::red2band::distributed::computePanelReflectors;
+ 883 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
+ 884 std::forward<CommSender>(mpi_col_chain_panel), v, mat_taus, j_sub,
+
+
+
+ 888 copyFromCPU(panel_view, v, mat_a);
+
+
+
+
+
+
+
+ 896 namespace ex = pika::execution::experimental;
+
+
+ 899 using dlaf::matrix::internal::CopyBackend_v;
+ 900 using pika::execution::thread_priority;
+
+
+ 903 auto spec = panel_view(i);
+
+
+ 906 ex::when_all(splitTile(mat_a.read(i), spec), splitTile(std::move(tmp_tile), spec)) |
+ 907 matrix::copy(Policy<CopyBackend_v<Device::GPU, Device::CPU>>(thread_priority::high)));
+
+
+
+
+
+ 913 namespace ex = pika::execution::experimental;
+
+
+ 916 using dlaf::matrix::internal::CopyBackend_v;
+ 917 using pika::execution::thread_priority;
+
+
+ 920 auto spec = panel_view(i);
+
+
+ 923 ex::when_all(splitTile(v.read(i), spec), splitTile(std::move(tile_a), spec)) |
+ 924 matrix::copy(Policy<CopyBackend_v<Device::CPU, Device::GPU>>(thread_priority::high)));
+
+
+
+
+
+
+
+
+ 933 template <Backend B, Device D,
class T>
+
+
+
+
+ 938 using namespace red2band::local;
+
+ 940 using common::iterate_range2d;
+ 941 using factorization::internal::computeTFactor;
+
+ 943 using pika::execution::experimental::any_sender;
+
+ 945 const auto dist_a = mat_a.distribution();
+
+ 947 {dist_a.blockSize().rows(), band_size});
+
+
+
+ 951 const SizeType nrefls = std::max<SizeType>(0, dist_a.size().rows() - band_size - 1);
+
+
+ 954 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
+
+
+
+
+
+
+
+
+
+ 964 Matrix<T, Device::CPU> mat_taus_retiled =
+ 965 mat_taus.retiledSubPipeline(
LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
+
+ 967 const SizeType ntiles = (nrefls - 1) / band_size + 1;
+ 968 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
+
+ 970 const bool is_full_band = (band_size == dist_a.blockSize().cols());
+
+ 972 constexpr std::size_t n_workspaces = 2;
+
+
+
+
+
+
+
+
+
+
+
+ 984 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist_a);
+
+ 986 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
+ 987 const auto i_sub = j_sub + 1;
+
+
+
+ 991 const SizeType nrefls_tile = mat_taus_retiled.tileSize(
GlobalTileIndex(j_sub, 0)).rows();
+
+ 993 const bool isPanelIncomplete = (nrefls_tile != band_size);
+
+
+ 996 DLAF_ASSERT_HEAVY(nrefls_tile != 0, nrefls_tile);
+
+
+
+
+
+ 1002 Panel<Coord::Col, T, D>& v = panels_v.nextResource();
+ 1003 v.setRangeStart(ij_offset);
+ 1004 if (isPanelIncomplete)
+ 1005 v.setWidth(nrefls_tile);
+
+
+ 1008 compute_panel_helper.call(mat_a, mat_taus_retiled, j_sub, panel_view);
+
+
+
+
+
+ 1014 constexpr
bool has_reflector_head =
true;
+ 1015 setupReflectorPanelV<B, D, T>(has_reflector_head, panel_view, nrefls_tile, v, mat_a, !is_full_band);
+
+
+
+
+ 1020 Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
+
+ 1022 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx));
+
+
+ 1025 const GlobalElementIndex at_offset(ij_offset + GlobalElementSize(0, band_size));
+
+
+ 1028 if (!at_offset.isIn(mat_a.size()))
+
+
+ 1031 const matrix::SubMatrixView trailing_matrix_view(dist_a, at_offset);
+
+
+ 1034 Panel<Coord::Col, T, D>& w = panels_w.nextResource();
+ 1035 w.setRangeStart(at_offset);
+ 1036 if (isPanelIncomplete)
+ 1037 w.setWidth(nrefls_tile);
+
+ 1039 trmmComputeW<B>(w, v, t.read(t_idx));
+
+
+ 1042 Panel<Coord::Col, T, D>& x = panels_x.nextResource();
+ 1043 x.setRangeStart(at_offset);
+ 1044 if (isPanelIncomplete)
+ 1045 x.setWidth(nrefls_tile);
+
+
+
+
+
+ 1051 hemmComputeX<B>(x, trailing_matrix_view, mat_a, w);
+
+
+
+
+
+
+
+ 1059 Matrix<T, D> w2 = std::move(t);
+
+ 1061 gemmComputeW2<B>(w2, w, x);
+ 1062 gemmUpdateX<B>(x, w2, v);
+
+
+
+
+ 1067 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, v);
+
+
+
+
+
+
+
+
+
+
+ 1078 template <Backend B, Device D,
class T>
+ 1079 Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid grid, Matrix<T, D>& mat_a,
+ 1080 const SizeType band_size) {
+ 1081 using namespace red2band::distributed;
+
+ 1083 using common::iterate_range2d;
+ 1084 using factorization::internal::computeTFactor;
+
+ 1086 namespace ex = pika::execution::experimental;
+
+
+
+
+ 1091 pika::threads::get_thread_manager().wait();
+
+ 1093 common::Pipeline<comm::Communicator> mpi_col_chain_panel(grid.colCommunicator().clone());
+ 1094 common::Pipeline<comm::Communicator> mpi_row_chain(grid.rowCommunicator().clone());
+ 1095 common::Pipeline<comm::Communicator> mpi_col_chain(grid.colCommunicator().clone());
+
+ 1097 const auto& dist = mat_a.distribution();
+ 1098 const comm::Index2D rank = dist.rankIndex();
+
+
+
+ 1102 const SizeType nrefls = std::max<SizeType>(0, dist.size().rows() - band_size - 1);
+
+
+ 1105 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
+ 1106 Matrix<T, Device::CPU> mat_taus(matrix::Distribution(GlobalElementSize(nrefls, 1),
+ 1107 TileElementSize(mat_a.blockSize().cols(), 1),
+ 1108 comm::Size2D(mat_a.commGridSize().cols(), 1),
+ 1109 comm::Index2D(mat_a.rankIndex().col(), 0),
+ 1110 comm::Index2D(mat_a.sourceRankIndex().col(), 0)));
+
+
+
+
+ 1115 Matrix<T, Device::CPU> mat_taus_retiled =
+ 1116 mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
+
+ 1118 const SizeType ntiles = (nrefls - 1) / band_size + 1;
+ 1119 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
+
+ 1121 const bool is_full_band = (band_size == dist.blockSize().cols());
+
+ 1123 constexpr std::size_t n_workspaces = 2;
+ 1124 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_v(n_workspaces, dist);
+ 1125 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_vt(
+ 1126 n_workspaces, dist);
+
+ 1128 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_w(n_workspaces, dist);
+ 1129 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_wt(
+ 1130 n_workspaces, dist);
+
+ 1132 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_x(n_workspaces, dist);
+ 1133 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_xt(
+ 1134 n_workspaces, dist);
+
+ 1136 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist);
+
+ 1138 ex::unique_any_sender<> trigger_panel{ex::just()};
+ 1139 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
+ 1140 const SizeType i_sub = j_sub + 1;
+
+ 1142 const GlobalElementIndex ij_offset(i_sub * band_size, j_sub * band_size);
+ 1143 const GlobalElementIndex at_offset(i_sub * band_size, (j_sub + 1) * band_size);
+
+ 1145 const comm::Index2D rank_v0{
+ 1146 dist.template rankGlobalElement<Coord::Row>(ij_offset.row()),
+ 1147 dist.template rankGlobalElement<Coord::Col>(ij_offset.col()),
+
+
+ 1150 const bool is_panel_rank_col = rank_v0.col() == rank.col();
+
+ 1152 const SizeType nrefls_tile = mat_taus_retiled.tileSize(GlobalTileIndex(j_sub, 0)).rows();
+
+ 1154 if (nrefls_tile == 0)
+
+
+ 1157 auto& v = panels_v.nextResource();
+ 1158 auto& vt = panels_vt.nextResource();
+
+ 1160 v.setRangeStart(at_offset);
+ 1161 vt.setRangeStart(at_offset);
+
+ 1163 v.setWidth(nrefls_tile);
+ 1164 vt.setHeight(nrefls_tile);
+
+ 1166 const LocalTileIndex t_idx(0, 0);
+
+
+ 1169 matrix::Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
+
+
+ 1172 const matrix::SubPanelView panel_view(dist, ij_offset, band_size);
+
+ 1174 if (is_panel_rank_col) {
+ 1175 compute_panel_helper.call(std::move(trigger_panel), rank_v0.row(), mpi_col_chain_panel(), mat_a,
+ 1176 mat_taus_retiled, j_sub, panel_view);
+
+
+
+
+
+ 1182 red2band::local::setupReflectorPanelV<B, D, T>(rank.row() == rank_v0.row(), panel_view,
+ 1183 nrefls_tile, v, mat_a, !is_full_band);
+ 1184 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx),
+
+
+
+
+
+
+ 1191 if (!at_offset.isIn(mat_a.size()))
+
+
+ 1194 const matrix::SubMatrixView trailing_matrix_view(dist, at_offset);
+
+ 1196 comm::broadcast(rank_v0.col(), v, vt, mpi_row_chain, mpi_col_chain);
+
+
+ 1199 auto& w = panels_w.nextResource();
+ 1200 auto& wt = panels_wt.nextResource();
+
+ 1202 w.setRangeStart(at_offset);
+ 1203 wt.setRangeStart(at_offset);
+
+ 1205 w.setWidth(nrefls_tile);
+ 1206 wt.setHeight(nrefls_tile);
+
+ 1208 if (is_panel_rank_col)
+ 1209 red2band::local::trmmComputeW<B, D>(w, v, t.read(t_idx));
+
+ 1211 comm::broadcast(rank_v0.col(), w, wt, mpi_row_chain, mpi_col_chain);
+
+
+ 1214 auto& x = panels_x.nextResource();
+ 1215 auto& xt = panels_xt.nextResource();
+
+ 1217 x.setRangeStart(at_offset);
+ 1218 xt.setRangeStart(at_offset);
+
+ 1220 x.setWidth(nrefls_tile);
+ 1221 xt.setHeight(nrefls_tile);
+
+
+
+
+
+
+
+
+
+ 1231 hemmComputeX<B, D>(rank_v0.col(), x, xt, trailing_matrix_view, mat_a, w, wt, mpi_row_chain,
+
+
+
+
+
+
+
+
+
+ 1241 if (is_panel_rank_col) {
+
+
+ 1244 matrix::Matrix<T, D> w2 = std::move(t);
+
+ 1246 red2band::local::gemmComputeW2<B, D>(w2, w, x);
+ 1247 ex::start_detached(comm::scheduleAllReduceInPlace(mpi_col_chain(), MPI_SUM,
+ 1248 w2.readwrite(LocalTileIndex(0, 0))));
+
+ 1250 red2band::local::gemmUpdateX<B, D>(x, w2, v);
+
+
+
+
+
+
+ 1257 xt.setRangeStart(at_offset);
+ 1258 xt.setHeight(nrefls_tile);
+
+ 1260 comm::broadcast(rank_v0.col(), x, xt, mpi_row_chain, mpi_col_chain);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1304 const SizeType j_tile_current = ij_offset.col() / dist.blockSize().cols();
+ 1305 const SizeType j_tile_next = at_offset.col() / dist.blockSize().cols();
+ 1306 const bool isNextColumnOnSameRank = (j_tile_current == j_tile_next);
+ 1307 const comm::IndexT_MPI rank_next_col =
+ 1308 isNextColumnOnSameRank ? rank_v0.col() : (rank_v0.col() + 1) % dist.commGridSize().cols();
+
+ 1310 if (rank.col() == rank_next_col) {
+ 1311 const LocalTileIndex at{
+ 1312 dist.template nextLocalTileFromGlobalElement<Coord::Row>(at_offset.row()),
+ 1313 dist.template nextLocalTileFromGlobalElement<Coord::Col>(at_offset.col()),
+
+
+
+
+
+
+
+
+ 1322 const SizeType at_tile_col =
+ 1323 dist.template globalTileFromGlobalElement<Coord::Col>(at_offset.col());
+
+ 1325 if (at_tile_col == dist.nrTiles().cols() - 1) {
+ 1326 const comm::IndexT_MPI owner = rank_v0.row();
+ 1327 if (rank.row() == owner) {
+ 1328 xt.setTile(at, x.read(at));
+
+ 1330 if (dist.commGridSize().rows() > 1)
+ 1331 ex::start_detached(comm::scheduleSendBcast(mpi_col_chain(), xt.read(at)));
+
+
+ 1334 if (dist.commGridSize().rows() > 1)
+ 1335 ex::start_detached(comm::scheduleRecvBcast(mpi_col_chain(), owner, xt.readwrite(at)));
+
+
+
+ 1339 if constexpr (dlaf::comm::CommunicationDevice_v<D> == D) {
+
+
+
+ 1343 if (rank.row() == rank_v0.row()) {
+ 1344 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
+
+
+
+
+
+
+ 1351 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
+
+
+
+ 1355 if (rank.row() == rank_v0.row()) {
+
+
+
+
+
+
+
+
+ 1364 trigger_panel = x.readwrite(at) | ex::drop_value() | ex::ensure_started();
+
+
+
+
+
+
+ 1371 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
+
+
+
+
+
+ 1377 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, vt, v, xt);
+
+
+
+
+
+
+
+
+
+
+
-
Definition: distribution.h:27
-
-ReadWriteSenderType readwrite(const LocalTileIndex &index) noexcept
Definition: matrix.h:121
-Definition: retiled_matrix.h:36
+
+ReadWriteSenderType readwrite(const LocalTileIndex &index) noexcept
Definition: matrix.h:122
int IndexT_MPI
Type used for indexes in MPI API.
Definition: communicator.h:23
@@ -1484,19 +1482,18 @@
ReadOnlyTileSender< T, D > splitTile(ReadOnlyTileSender< T, D > tile, const SubTileSpec &spec)
Definition: tile.h:507
-dlaf::BaseType< T > norm(comm::CommunicatorGrid grid, comm::Index2D rank, lapack::Norm norm_type, blas::Uplo uplo, Matrix< const T, device > &A)
Definition: norm.h:43
+dlaf::BaseType< T > norm(comm::CommunicatorGrid grid, comm::Index2D rank, lapack::Norm norm_type, blas::Uplo uplo, Matrix< const T, device > &A)
Definition: norm.h:44
-
Definition: round_robin.h:20
-
+
ReadWriteSenderType readwrite(LocalTileIndex index)
Definition: panel.h:570
diff --git a/master/files.html b/master/files.html
index cdbba16a25..a9d003ade2 100644
--- a/master/files.html
+++ b/master/files.html
@@ -216,10 +216,9 @@
print_csv.h | |
print_gpu.h | |
print_numpy.h | |
- retiled_matrix.h | |
- tile.h | |
- util_distribution.h | |
- views.h | |
+ tile.h | |
+ util_distribution.h | |
+ views.h | |
► memory | |
memory_chunk.h | |
memory_view.h | |
diff --git a/master/functions_d.html b/master/functions_d.html
index af9b81d1ca..2b1efff2e7 100644
--- a/master/functions_d.html
+++ b/master/functions_d.html
@@ -91,6 +91,9 @@ - d -
diff --git a/master/functions_func_d.html b/master/functions_func_d.html
index 7da8334928..1ae3105610 100644
--- a/master/functions_func_d.html
+++ b/master/functions_func_d.html
@@ -81,6 +81,9 @@ - d -
diff --git a/master/functions_func_r.html b/master/functions_func_r.html
index 348a38a440..eec6e1a6bc 100644
--- a/master/functions_func_r.html
+++ b/master/functions_func_r.html
@@ -97,13 +97,11 @@ - r -
: dlaf::matrix::internal::TilePipeline< T, D >
, dlaf::matrix::Matrix< const T, D >
, dlaf::matrix::Panel< axis, const T, D, StoreTransposed::No >
-, dlaf::matrix::RetiledMatrix< T, D >
- readwrite()
: dlaf::matrix::internal::TilePipeline< T, D >
, dlaf::matrix::Matrix< T, D >
, dlaf::matrix::Panel< axis, T, D, Storage >
-, dlaf::matrix::RetiledMatrix< T, D >
- readwrite_with_wrapper()
: dlaf::matrix::internal::TilePipeline< T, D >
@@ -113,8 +111,11 @@
- r -
-- RetiledMatrix()
-: dlaf::matrix::RetiledMatrix< T, D >
+
- retiledSubPipeline()
+: dlaf::matrix::Matrix< T, D >
+
+- retiledSubPipelineConst()
+: dlaf::matrix::Matrix< const T, D >
- rowCommunicator()
: dlaf::comm::CommunicatorGrid
diff --git a/master/functions_func_s.html b/master/functions_func_s.html
index 617197a5e0..34786b10a4 100644
--- a/master/functions_func_s.html
+++ b/master/functions_func_s.html
@@ -102,6 +102,12 @@
- s -
- SubPanelView()
: dlaf::matrix::SubPanelView
+- subPipeline()
+: dlaf::matrix::Matrix< T, D >
+
+- subPipelineConst()
+: dlaf::matrix::Matrix< const T, D >
+
- subTileReference()
: dlaf::matrix::Tile< T, D >
, dlaf::matrix::Tile< const T, D >
diff --git a/master/functions_func_w.html b/master/functions_func_w.html
index 413bbe4aff..69d218dfcc 100644
--- a/master/functions_func_w.html
+++ b/master/functions_func_w.html
@@ -66,7 +66,6 @@
- w -
diff --git a/master/functions_r.html b/master/functions_r.html
index 0ec72bd5c5..d2e68d5e82 100644
--- a/master/functions_r.html
+++ b/master/functions_r.html
@@ -97,13 +97,11 @@ - r -
- readwrite()
: dlaf::matrix::internal::TilePipeline< T, D >
, dlaf::matrix::Matrix< T, D >
, dlaf::matrix::Panel< axis, T, D, Storage >
-, dlaf::matrix::RetiledMatrix< T, D >
- readwrite_with_wrapper()
: dlaf::matrix::internal::TilePipeline< T, D >
@@ -113,8 +111,11 @@
- r -
-- RetiledMatrix()
-: dlaf::matrix::RetiledMatrix< T, D >
+
- retiledSubPipeline()
+: dlaf::matrix::Matrix< T, D >
+
+- retiledSubPipelineConst()
+: dlaf::matrix::Matrix< const T, D >
- rowCommunicator()
: dlaf::comm::CommunicatorGrid
diff --git a/master/functions_s.html b/master/functions_s.html
index 1444582871..bc1d3042b3 100644
--- a/master/functions_s.html
+++ b/master/functions_s.html
@@ -108,6 +108,12 @@
- s -
- SubPanelView()
: dlaf::matrix::SubPanelView
+- subPipeline()
+: dlaf::matrix::Matrix< T, D >
+
+- subPipelineConst()
+: dlaf::matrix::Matrix< const T, D >
+
- subTileReference()
: dlaf::matrix::Tile< T, D >
, dlaf::matrix::Tile< const T, D >
diff --git a/master/functions_w.html b/master/functions_w.html
index 89f0ef8197..49eb922c52 100644
--- a/master/functions_w.html
+++ b/master/functions_w.html
@@ -66,7 +66,6 @@
- w -
diff --git a/master/gen__eigensolver_8h_source.html b/master/gen__eigensolver_8h_source.html
index aa4eb0c628..e4eadf64fb 100644
--- a/master/gen__eigensolver_8h_source.html
+++ b/master/gen__eigensolver_8h_source.html
@@ -110,83 +110,91 @@
59 DLAF_ASSERT(eigenvectors.size() == mat_a.size(), eigenvectors, mat_a);
60 DLAF_ASSERT(eigenvectors.blockSize() == mat_a.blockSize(), eigenvectors, mat_a);
-
-
-
-
- 80 template <Backend B, Device D,
class T>
-
- 82 DLAF_ASSERT(matrix::local_matrix(mat_a), mat_a);
- 83 DLAF_ASSERT(matrix::local_matrix(mat_b), mat_b);
- 84 DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
- 85 DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
- 86 DLAF_ASSERT(matrix::square_size(mat_b), mat_b);
- 87 DLAF_ASSERT(matrix::square_blocksize(mat_b), mat_b);
- 88 DLAF_ASSERT(mat_a.size() == mat_b.size(), mat_a, mat_b);
- 89 DLAF_ASSERT(mat_a.blockSize() == mat_b.blockSize(), mat_a, mat_b);
-
- 91 const SizeType size = mat_a.size().rows();
-
-
-
-
+ 61 DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
+ 62 DLAF_ASSERT(matrix::single_tile_per_block(mat_b), mat_b);
+ 63 DLAF_ASSERT(matrix::single_tile_per_block(eigenvalues), eigenvalues);
+ 64 DLAF_ASSERT(matrix::single_tile_per_block(eigenvectors), eigenvectors);
+
+
+
+
+ 84 template <Backend B, Device D,
class T>
+
+ 86 DLAF_ASSERT(matrix::local_matrix(mat_a), mat_a);
+ 87 DLAF_ASSERT(matrix::local_matrix(mat_b), mat_b);
+ 88 DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
+ 89 DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
+ 90 DLAF_ASSERT(matrix::square_size(mat_b), mat_b);
+ 91 DLAF_ASSERT(matrix::square_blocksize(mat_b), mat_b);
+ 92 DLAF_ASSERT(mat_a.size() == mat_b.size(), mat_a, mat_b);
+ 93 DLAF_ASSERT(mat_a.blockSize() == mat_b.blockSize(), mat_a, mat_b);
+
+ 95 const SizeType size = mat_a.size().rows();
- 97 genEigensolver<B, D, T>(uplo, mat_a, mat_b, eigenvalues, eigenvectors);
-
- 99 return {std::move(eigenvalues), std::move(eigenvectors)};
-
-
- 121 template <Backend B, Device D,
class T>
-
- 123 Matrix<T, D>& mat_b, Matrix<BaseType<T>, D>& eigenvalues,
- 124 Matrix<T, D>& eigenvectors) {
- 125 DLAF_ASSERT(matrix::equal_process_grid(mat_a, grid), mat_a, grid);
- 126 DLAF_ASSERT(matrix::equal_process_grid(mat_b, grid), mat_b, grid);
- 127 DLAF_ASSERT(matrix::local_matrix(eigenvalues), eigenvalues);
- 128 DLAF_ASSERT(matrix::equal_process_grid(eigenvectors, grid), eigenvectors, grid);
- 129 DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
- 130 DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
- 131 DLAF_ASSERT(matrix::square_size(mat_b), mat_b);
- 132 DLAF_ASSERT(matrix::square_blocksize(mat_b), mat_b);
- 133 DLAF_ASSERT(matrix::square_size(eigenvectors), eigenvectors);
- 134 DLAF_ASSERT(matrix::square_blocksize(eigenvectors), eigenvectors);
- 135 DLAF_ASSERT(mat_a.size() == mat_b.size(), mat_a, mat_b);
- 136 DLAF_ASSERT(mat_a.blockSize() == mat_b.blockSize(), mat_a, mat_b);
- 137 DLAF_ASSERT(eigenvalues.size().rows() == eigenvectors.size().rows(), eigenvalues, eigenvectors);
- 138 DLAF_ASSERT(eigenvalues.blockSize().rows() == eigenvectors.blockSize().rows(), eigenvalues,
-
- 140 DLAF_ASSERT(eigenvectors.size() == mat_a.size(), eigenvectors, mat_a);
- 141 DLAF_ASSERT(eigenvectors.blockSize() == mat_a.blockSize(), eigenvectors, mat_a);
-
-
-
-
- 162 template <Backend B, Device D,
class T>
-
- 164 Matrix<T, D>& mat_b) {
- 165 DLAF_ASSERT(matrix::equal_process_grid(mat_a, grid), mat_a, grid);
- 166 DLAF_ASSERT(matrix::equal_process_grid(mat_b, grid), mat_b, grid);
- 167 DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
- 168 DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
- 169 DLAF_ASSERT(matrix::square_size(mat_b), mat_b);
- 170 DLAF_ASSERT(matrix::square_blocksize(mat_b), mat_b);
- 171 DLAF_ASSERT(mat_a.size() == mat_b.size(), mat_a, mat_b);
- 172 DLAF_ASSERT(mat_a.blockSize() == mat_b.blockSize(), mat_a, mat_b);
-
- 174 const SizeType size = mat_a.size().rows();
-
-
-
-
-
- 180 genEigensolver<B, D, T>(grid, uplo, mat_a, mat_b, eigenvalues, eigenvectors);
+
+
+
+
+ 101 genEigensolver<B, D, T>(uplo, mat_a, mat_b, eigenvalues, eigenvectors);
+
+ 103 return {std::move(eigenvalues), std::move(eigenvectors)};
+
+
+ 125 template <Backend B, Device D,
class T>
+
+ 127 Matrix<T, D>& mat_b, Matrix<BaseType<T>, D>& eigenvalues,
+ 128 Matrix<T, D>& eigenvectors) {
+ 129 DLAF_ASSERT(matrix::equal_process_grid(mat_a, grid), mat_a, grid);
+ 130 DLAF_ASSERT(matrix::equal_process_grid(mat_b, grid), mat_b, grid);
+ 131 DLAF_ASSERT(matrix::local_matrix(eigenvalues), eigenvalues);
+ 132 DLAF_ASSERT(matrix::equal_process_grid(eigenvectors, grid), eigenvectors, grid);
+ 133 DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
+ 134 DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
+ 135 DLAF_ASSERT(matrix::square_size(mat_b), mat_b);
+ 136 DLAF_ASSERT(matrix::square_blocksize(mat_b), mat_b);
+ 137 DLAF_ASSERT(matrix::square_size(eigenvectors), eigenvectors);
+ 138 DLAF_ASSERT(matrix::square_blocksize(eigenvectors), eigenvectors);
+ 139 DLAF_ASSERT(mat_a.size() == mat_b.size(), mat_a, mat_b);
+ 140 DLAF_ASSERT(mat_a.blockSize() == mat_b.blockSize(), mat_a, mat_b);
+ 141 DLAF_ASSERT(eigenvalues.size().rows() == eigenvectors.size().rows(), eigenvalues, eigenvectors);
+ 142 DLAF_ASSERT(eigenvalues.blockSize().rows() == eigenvectors.blockSize().rows(), eigenvalues,
+
+ 144 DLAF_ASSERT(eigenvectors.size() == mat_a.size(), eigenvectors, mat_a);
+ 145 DLAF_ASSERT(eigenvectors.blockSize() == mat_a.blockSize(), eigenvectors, mat_a);
+ 146 DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
+ 147 DLAF_ASSERT(matrix::single_tile_per_block(mat_b), mat_b);
+ 148 DLAF_ASSERT(matrix::single_tile_per_block(eigenvalues), eigenvalues);
+ 149 DLAF_ASSERT(matrix::single_tile_per_block(eigenvectors), eigenvectors);
+
+
+
+
+ 170 template <Backend B, Device D,
class T>
+
+ 172 Matrix<T, D>& mat_b) {
+ 173 DLAF_ASSERT(matrix::equal_process_grid(mat_a, grid), mat_a, grid);
+ 174 DLAF_ASSERT(matrix::equal_process_grid(mat_b, grid), mat_b, grid);
+ 175 DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
+ 176 DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
+ 177 DLAF_ASSERT(matrix::square_size(mat_b), mat_b);
+ 178 DLAF_ASSERT(matrix::square_blocksize(mat_b), mat_b);
+ 179 DLAF_ASSERT(mat_a.size() == mat_b.size(), mat_a, mat_b);
+ 180 DLAF_ASSERT(mat_a.blockSize() == mat_b.blockSize(), mat_a, mat_b);
- 182 return {std::move(eigenvalues), std::move(eigenvectors)};
-
-
+ 182 const SizeType size = mat_a.size().rows();
+
+
+
+
+
+ 188 genEigensolver<B, D, T>(grid, uplo, mat_a, mat_b, eigenvalues, eigenvectors);
+
+ 190 return {std::move(eigenvalues), std::move(eigenvectors)};
+
+
Definition: communicator_grid.h:42
-
+
void genEigensolver(blas::Uplo uplo, Matrix< T, D > &mat_a, Matrix< T, D > &mat_b, Matrix< BaseType< T >, D > &eigenvalues, Matrix< T, D > &eigenvectors)
Definition: gen_eigensolver.h:42
diff --git a/master/gen__to__std_8h.html b/master/gen__to__std_8h.html
index 99a2f4fe22..b7cd4771a9 100644
--- a/master/gen__to__std_8h.html
+++ b/master/gen__to__std_8h.html
@@ -137,6 +137,8 @@
diff --git a/master/norm_8h_source.html b/master/norm_8h_source.html
index df04f2b0cb..e930d0e8a8 100644
--- a/master/norm_8h_source.html
+++ b/master/norm_8h_source.html
@@ -93,49 +93,51 @@
24 namespace dlaf::auxiliary {
- 42 template <Backend backend, Device device,
class T>
-
- 44 blas::Uplo uplo, Matrix<const T, device>& A) {
- 45 using dlaf::matrix::equal_process_grid;
-
- 47 DLAF_ASSERT(equal_process_grid(A, grid), A, grid);
+ 43 template <Backend backend, Device device,
class T>
+
+ 45 blas::Uplo uplo, Matrix<const T, device>& A) {
+ 46 using dlaf::matrix::equal_process_grid;
+ 47 using dlaf::matrix::single_tile_per_block;
-
- 50 if (A.size().isEmpty())
-
-
-
- 54 case lapack::Norm::One:
- 55 case lapack::Norm::Two:
- 56 case lapack::Norm::Inf:
- 57 case lapack::Norm::Fro:
- 58 DLAF_UNIMPLEMENTED(norm_type);
-
- 60 case lapack::Norm::Max:
-
- 62 case blas::Uplo::Lower:
-
- 64 case blas::Uplo::Upper:
- 65 DLAF_UNIMPLEMENTED(norm_type, uplo);
-
- 67 case blas::Uplo::General:
-
-
-
-
-
-
-
-
-
-
+ 49 DLAF_ASSERT(equal_process_grid(A, grid), A, grid);
+ 50 DLAF_ASSERT(single_tile_per_block(A), A);
+
+
+ 53 if (A.size().isEmpty())
+
+
+
+ 57 case lapack::Norm::One:
+ 58 case lapack::Norm::Two:
+ 59 case lapack::Norm::Inf:
+ 60 case lapack::Norm::Fro:
+ 61 DLAF_UNIMPLEMENTED(norm_type);
+
+ 63 case lapack::Norm::Max:
+
+ 65 case blas::Uplo::Lower:
+
+ 67 case blas::Uplo::Upper:
+ 68 DLAF_UNIMPLEMENTED(norm_type, uplo);
+
+ 70 case blas::Uplo::General:
+
+
+
+
+
+
+
+
+
+
Definition: communicator_grid.h:42
-dlaf::BaseType< T > norm(comm::CommunicatorGrid grid, comm::Index2D rank, lapack::Norm norm_type, blas::Uplo uplo, Matrix< const T, device > &A)
Definition: norm.h:43
+dlaf::BaseType< T > norm(comm::CommunicatorGrid grid, comm::Index2D rank, lapack::Norm norm_type, blas::Uplo uplo, Matrix< const T, device > &A)
Definition: norm.h:44
diff --git a/master/panel_8h_source.html b/master/panel_8h_source.html
index 746915844c..090adbd2d2 100644
--- a/master/panel_8h_source.html
+++ b/master/panel_8h_source.html
@@ -552,7 +552,7 @@
IndexT get() const noexcept
Return a copy of the row or the col index as specified by rc.
Definition: index2d.h:86
void transpose() noexcept
Swaps row and column index/size.
Definition: index2d.h:106
Definition: distribution.h:27
-
+
diff --git a/master/permutations_2general_2impl_8h_source.html b/master/permutations_2general_2impl_8h_source.html
index b9c31c3b12..a350e08801 100644
--- a/master/permutations_2general_2impl_8h_source.html
+++ b/master/permutations_2general_2impl_8h_source.html
@@ -655,8 +655,8 @@
SizeType globalElementFromGlobalTileAndTileElement(SizeType global_tile, SizeType tile_element) const noexcept
Definition: distribution.h:247
SizeType nextLocalTileFromGlobalTile(SizeType global_tile) const noexcept
Definition: distribution.h:351
Definition: layout_info.h:26
-
-
+
+
diff --git a/master/permutations_2general_8h_source.html b/master/permutations_2general_8h_source.html
index fd82240d72..affdb84dff 100644
--- a/master/permutations_2general_8h_source.html
+++ b/master/permutations_2general_8h_source.html
@@ -109,51 +109,59 @@
52 DLAF_ASSERT(matrix::equal_size(mat_in, mat_out), mat_in);
53 DLAF_ASSERT(matrix::equal_blocksize(mat_in, mat_out), mat_in);
- 55 DLAF_ASSERT(perms.size().rows() == mat_in.size().rows(), perms, mat_in);
- 56 DLAF_ASSERT(perms.size().cols() == 1, perms);
- 57 DLAF_ASSERT(perms.blockSize().rows() == mat_in.blockSize().rows(), mat_in, perms);
+ 55 DLAF_ASSERT(matrix::single_tile_per_block(perms), perms);
+ 56 DLAF_ASSERT(matrix::single_tile_per_block(mat_in), mat_in);
+ 57 DLAF_ASSERT(matrix::single_tile_per_block(mat_out), mat_out);
- 59 DLAF_ASSERT(i_begin >= 0 && i_begin <= i_end, i_begin, i_end);
- 60 DLAF_ASSERT(i_end <= perms.nrTiles().rows(), i_end, perms);
-
-
-
-
- 85 template <Backend B, Device D,
class T, Coord coord>
-
- 87 SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms,
- 88 Matrix<const T, D>& mat_in, Matrix<T, D>& mat_out) {
- 89 DLAF_ASSERT(matrix::local_matrix(perms), perms);
- 90 DLAF_ASSERT(matrix::equal_process_grid(mat_in, grid), mat_in, grid);
- 91 DLAF_ASSERT(matrix::equal_process_grid(mat_out, grid), mat_out, grid);
-
-
-
-
-
-
- 98 DLAF_ASSERT(square_size(mat_in), mat_in);
- 99 DLAF_ASSERT(matrix::square_blocksize(mat_in), mat_in);
- 100 DLAF_ASSERT(matrix::equal_size(mat_in, mat_out), mat_in);
- 101 DLAF_ASSERT(matrix::equal_blocksize(mat_in, mat_out), mat_in);
-
- 103 DLAF_ASSERT(perms.size().rows() == mat_in.size().rows(), perms, mat_in);
- 104 DLAF_ASSERT(perms.size().cols() == 1, perms);
- 105 DLAF_ASSERT(perms.blockSize().rows() == mat_in.blockSize().rows(), mat_in, perms);
+ 59 DLAF_ASSERT(perms.size().rows() == mat_in.size().rows(), perms, mat_in);
+ 60 DLAF_ASSERT(perms.size().cols() == 1, perms);
+ 61 DLAF_ASSERT(perms.blockSize().rows() == mat_in.blockSize().rows(), mat_in, perms);
+
+ 63 DLAF_ASSERT(i_begin >= 0 && i_begin <= i_end, i_begin, i_end);
+ 64 DLAF_ASSERT(i_end <= perms.nrTiles().rows(), i_end, perms);
+
+
+
+
+ 89 template <Backend B, Device D,
class T, Coord coord>
+
+ 91 SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms,
+ 92 Matrix<const T, D>& mat_in, Matrix<T, D>& mat_out) {
+ 93 DLAF_ASSERT(matrix::local_matrix(perms), perms);
+ 94 DLAF_ASSERT(matrix::equal_process_grid(mat_in, grid), mat_in, grid);
+ 95 DLAF_ASSERT(matrix::equal_process_grid(mat_out, grid), mat_out, grid);
+
+
+
+
+
+
+ 102 DLAF_ASSERT(square_size(mat_in), mat_in);
+ 103 DLAF_ASSERT(matrix::square_blocksize(mat_in), mat_in);
+ 104 DLAF_ASSERT(matrix::equal_size(mat_in, mat_out), mat_in);
+ 105 DLAF_ASSERT(matrix::equal_blocksize(mat_in, mat_out), mat_in);
- 107 DLAF_ASSERT(i_begin >= 0 && i_begin <= i_end, i_begin, i_end);
- 108 DLAF_ASSERT(i_end <= perms.nrTiles().rows(), i_end, perms);
-
-
-
-
- 118 template <Backend B, Device D,
class T, Coord coord>
-
- 120 Matrix<const SizeType, D>& perms, Matrix<const T, D>& mat_in, Matrix<T, D>& mat_out) {
-
- 122 permute<B, D, T, coord>(grid, sub_task_chain, i_begin, i_end, perms, mat_in, mat_out);
-
-
+ 107 DLAF_ASSERT(matrix::single_tile_per_block(perms), perms);
+ 108 DLAF_ASSERT(matrix::single_tile_per_block(mat_in), mat_in);
+ 109 DLAF_ASSERT(matrix::single_tile_per_block(mat_out), mat_out);
+
+ 111 DLAF_ASSERT(perms.size().rows() == mat_in.size().rows(), perms, mat_in);
+ 112 DLAF_ASSERT(perms.size().cols() == 1, perms);
+ 113 DLAF_ASSERT(perms.blockSize().rows() == mat_in.blockSize().rows(), mat_in, perms);
+
+ 115 DLAF_ASSERT(i_begin >= 0 && i_begin <= i_end, i_begin, i_end);
+ 116 DLAF_ASSERT(i_end <= perms.nrTiles().rows(), i_end, perms);
+
+
+
+
+ 126 template <Backend B, Device D,
class T, Coord coord>
+
+ 128 Matrix<const SizeType, D>& perms, Matrix<const T, D>& mat_in, Matrix<T, D>& mat_out) {
+
+ 130 permute<B, D, T, coord>(grid, sub_task_chain, i_begin, i_end, perms, mat_in, mat_out);
+
+
Definition: communicator_grid.h:42
Definition: pipeline.h:31
diff --git a/master/print__csv_8h_source.html b/master/print__csv_8h_source.html
index 0067cc52c5..2569826fc1 100644
--- a/master/print__csv_8h_source.html
+++ b/master/print__csv_8h_source.html
@@ -140,7 +140,7 @@
-
+
@@ -150,7 +150,7 @@
-bool local_matrix(const Matrix< const T, D > &m) noexcept
Returns true if the matrix is local to a process.
Definition: util_matrix.h:66
+bool local_matrix(const Matrix< const T, D > &m) noexcept
Returns true if the matrix is local to a process.
Definition: util_matrix.h:72