From 2425d8643275b51dbf6282d6854d9f03c061cd55 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi <9337627+albestro@users.noreply.github.com> Date: Tue, 12 Dec 2023 13:42:18 +0100 Subject: [PATCH 1/7] TridiagSolver (dist): STEP2b Make rank1 work just on local non-deflated eigenvectors (multi-threaded) (#997) --- .../dlaf/eigensolver/tridiag_solver/merge.h | 684 ++++++++++-------- 1 file changed, 367 insertions(+), 317 deletions(-) diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h index 4fd051b36a..e61c74f7c5 100644 --- a/include/dlaf/eigensolver/tridiag_solver/merge.h +++ b/include/dlaf/eigensolver/tridiag_solver/merge.h @@ -1257,17 +1257,9 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S comm, req)); }; - // Note: at least two column of tiles per-worker, in the range [1, getTridiagRank1NWorkers()] - const std::size_t nthreads = [nrtiles = dist_sub.local_nr_tiles().cols()]() { - const std::size_t min_workers = 1; - const std::size_t available_workers = getTridiagRank1NWorkers(); - const std::size_t ideal_workers = util::ceilDiv(to_sizet(nrtiles), to_sizet(2)); - return std::clamp(ideal_workers, min_workers, available_workers); - }(); - + const auto hp_scheduler = di::getBackendScheduler(thread_priority::high); ex::start_detached( - ex::when_all(ex::just(std::make_unique>(nthreads)), - std::forward(row_comm), std::forward(col_comm), + ex::when_all(std::forward(row_comm), std::forward(col_comm), std::forward(k), std::forward(k_lc), std::forward(rho), ex::when_all_vector(tc.read(d)), ex::when_all_vector(tc.readwrite(z)), ex::when_all_vector(tc.readwrite(evals)), ex::when_all_vector(tc.read(i4)), @@ -1276,317 +1268,375 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S // additional workspaces ex::just(std::vector>()), ex::just(memory::MemoryView())) | - ex::transfer(di::getBackendScheduler(thread_priority::high)) | - ex::bulk(nthreads, [nthreads, n, dist_sub, bcast_evals, all_reduce_in_place]( - const std::size_t thread_idx, auto& barrier_ptr, auto& row_comm_wrapper, - auto& col_comm_wrapper, const SizeType k, const SizeType k_lc, - const auto& rho, const auto& d_tiles, auto& z_tiles, const auto& eval_tiles, - const auto& i4_tiles_arr, const auto& i6_tiles_arr, - const auto& i2_tiles_arr, const auto& evec_tiles, auto& ws_cols, - auto& ws_row) { - using dlaf::comm::internal::transformMPI; - - common::Pipeline row_comm_chain(row_comm_wrapper.get()); - const dlaf::comm::Communicator& col_comm = col_comm_wrapper.get(); - - const SizeType m_el_lc = dist_sub.local_size().rows(); - const SizeType n_el_lc = dist_sub.local_size().cols(); - - const auto barrier_busy_wait = getTridiagRank1BarrierBusyWait(); - - const SizeType* i4 = i4_tiles_arr[0].get().ptr(); - const SizeType* i2 = i2_tiles_arr[0].get().ptr(); - const SizeType* i6 = i6_tiles_arr[0].get().ptr(); - - // STEP 0a: Fill ones for deflated Eigenvectors and copy related Eigenvalues (single-thread) - // Note: this step is completely independent from the rest, but it is small and it is going - // to be dropped soon. - // Note: use last threads that in principle should have less work to do - if (k < n && thread_idx == nthreads - 1) { - const T* eval_initial_ptr = d_tiles[0].get().ptr(); - T* eval_ptr = eval_tiles[0].ptr(); - - for (SizeType j_el_lc = k_lc; j_el_lc < n_el_lc; ++j_el_lc) { - const SizeType j_el = dist_sub.global_element_from_local_element(j_el_lc); - const SizeType i_el = j_el; - - if (dist_sub.rank_index().row() == dist_sub.rank_global_element(i_el)) { - const SizeType i_el_lc = dist_sub.local_element_from_global_element(i_el); - const LocalTileIndex i_lc{dist_sub.local_tile_from_local_element(i_el_lc), - dist_sub.local_tile_from_local_element(j_el_lc)}; - const SizeType linear_lc = dist_extra::local_tile_linear_index(dist_sub, i_lc); - const TileElementIndex - ij_el_tl{dist_sub.tile_element_from_local_element(i_el_lc), - dist_sub.tile_element_from_local_element(j_el_lc)}; - - evec_tiles[to_sizet(linear_lc)](ij_el_tl) = T{1}; - } - - eval_ptr[j_el] = eval_initial_ptr[i6[j_el]]; - } - } - - // STEP 0b: Initialize workspaces (single-thread) - if (thread_idx == 0) { - // Note: - // - nthreads are used for both LAED4 and weight calculation (one per worker thread) - // - last one is used for reducing weights from all workers - ws_cols.reserve(nthreads + 1); - - // Note: - // Considering that - // - LAED4 requires working on k elements - // - Weight computation requires working on m_el_lc - // - // and they are needed at two steps that cannot happen in parallel, we opted for allocating - // the workspace with the highest requirement of memory, and reuse them for both steps. - const SizeType max_size = std::max(k, m_el_lc); - for (std::size_t i = 0; i < nthreads; ++i) - ws_cols.emplace_back(max_size); - ws_cols.emplace_back(m_el_lc); - - ws_row = memory::MemoryView(n_el_lc); - std::fill_n(ws_row(), n_el_lc, 0); - } - - // Note: we have to wait that LAED4 workspaces are ready to be used - barrier_ptr->arrive_and_wait(barrier_busy_wait); - - const T* d_ptr = d_tiles[0].get().ptr(); - const T* z_ptr = z_tiles[0].ptr(); - - // STEP 1: LAED4 (multi-thread) - if (thread_idx == 0) { // TODO make it multi-threaded over multiple workers - common::internal::SingleThreadedBlasScope single; - - T* eval_ptr = eval_tiles[0].ptr(); - T* delta_ptr = ws_cols[thread_idx](); - - for (SizeType j_el_lc = 0; j_el_lc < k_lc; ++j_el_lc) { - const SizeType j_el = dist_sub.global_element_from_local_element(j_el_lc); - const SizeType j_lc = dist_sub.local_tile_from_local_element(j_el_lc); - - // Solve the deflated rank-1 problem - // Note: - // Input eigenvalues are stored "deflated" with i3, but laed4 is going to store them - // "locally" deflated, i.e. locally it is valid sort(non-deflated)|sort(deflated) - const SizeType js_el = i6[j_el]; - T& eigenval = eval_ptr[to_sizet(j_el)]; - lapack::laed4(to_signed(k), to_signed(js_el), d_ptr, z_ptr, delta_ptr, rho, - &eigenval); - - // Now laed4 result has to be copied in the right spot - const SizeType j_el_tl = dist_sub.tile_element_from_global_element(j_el); - - for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) { - const SizeType i_el = dist_sub.global_element_from_local_element(i_el_lc); - const SizeType is_el = i4[i_el]; - - // just non-deflated, because deflated have been already set to 0 - if (is_el < k) { - const SizeType i_lc = dist_sub.local_tile_from_local_element(i_el_lc); - const SizeType linear_lc = dist_extra::local_tile_linear_index(dist_sub, {i_lc, j_lc}); - const auto& evec = evec_tiles[to_sizet(linear_lc)]; - const SizeType i_el_tl = dist_sub.tile_element_from_local_element(i_el_lc); - evec({i_el_tl, j_el_tl}) = delta_ptr[is_el]; - } - } - } - } - // Note: This barrier ensures that LAED4 finished, so from now on values are available - barrier_ptr->arrive_and_wait(barrier_busy_wait); - - // STEP 2: Broadcast evals - - // Note: this ensures that evals broadcasting finishes before bulk releases resources - ScopedSenderWait bcast_barrier; - if (thread_idx == 0) - bcast_barrier.sender_ = bcast_evals(row_comm_chain, eval_tiles); - - // Note: laed4 handles k <= 2 cases differently - if (k <= 2) - return; - - // STEP 2 Compute weights (multi-thread) - auto& q = evec_tiles; - T* w = ws_cols[thread_idx](); - - // STEP 2a: copy diagonal from q -> w (or just initialize with 1) - if (thread_idx == 0) { - for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) { - const SizeType i_el = dist_sub.global_element_from_local_element(i_el_lc); - const SizeType is_el = i4[i_el]; - - if (is_el >= k) { - w[i_el_lc] = T{0}; - continue; - } - - const SizeType js_el = is_el; - const SizeType j_el = i2[js_el]; - - const GlobalElementIndex ij_subm_el(i_el, j_el); - - if (dist_sub.rank_index().col() == dist_sub.rank_global_element(j_el)) { - const SizeType linear_subm_lc = dist_extra::local_tile_linear_index( - dist_sub, {dist_sub.local_tile_from_local_element(i_el_lc), - dist_sub.local_tile_from_global_element(j_el)}); - const TileElementIndex ij_tl = dist_sub.tile_element_index(ij_subm_el); - w[i_el_lc] = q[to_sizet(linear_subm_lc)](ij_tl); - } - else { - w[i_el_lc] = T{1}; - } - } - } - else { // other workers - std::fill_n(w, m_el_lc, T(1)); - } - - barrier_ptr->arrive_and_wait(barrier_busy_wait); - - // STEP 2b: compute weights - if (thread_idx == 0) { // TODO make it multithreaded again - for (SizeType j_el_lc = 0; j_el_lc < k_lc; ++j_el_lc) { - const SizeType j_el = dist_sub.global_element_from_local_element(j_el_lc); - const SizeType js_el = i6[j_el]; - - for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) { - const SizeType i_el = dist_sub.global_element_from_local_element(i_el_lc); - const SizeType is_el = i4[i_el]; - - // skip if deflated - if (is_el >= k) - continue; - - // skip if originally it was on the diagonal - if (is_el == js_el) - continue; - - const SizeType linear_lc = dist_extra::local_tile_linear_index( - dist_sub, {dist_sub.local_tile_from_local_element(i_el_lc), - dist_sub.local_tile_from_local_element(j_el_lc)}); - const TileElementIndex ij_tl = dist_sub.tile_element_index({i_el, j_el}); - - w[i_el_lc] *= - q[to_sizet(linear_lc)](ij_tl) / (d_ptr[to_sizet(is_el)] - d_ptr[to_sizet(js_el)]); - } - } - } - - barrier_ptr->arrive_and_wait(barrier_busy_wait); - - // STEP 2c: reduce, then finalize computation with sign and square root (single-thread) - if (thread_idx == 0) { - // local reduction from all bulk workers - for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) { - for (std::size_t tidx = 1; tidx < nthreads; ++tidx) { - const T* w_partial = ws_cols[tidx](); - w[i_el_lc] *= w_partial[i_el_lc]; - } - } - - tt::sync_wait(ex::when_all(row_comm_chain(), - ex::just(MPI_PROD, common::make_data(w, m_el_lc))) | - transformMPI(all_reduce_in_place)); + ex::transfer(hp_scheduler) | + ex::let_value([n, dist_sub, bcast_evals, all_reduce_in_place, hp_scheduler]( + auto& row_comm_wrapper, auto& col_comm_wrapper, const SizeType k, + const SizeType k_lc, const auto& rho, const auto& d_tiles, auto& z_tiles, + const auto& eval_tiles, const auto& i4_tiles_arr, const auto& i6_tiles_arr, + const auto& i2_tiles_arr, const auto& evec_tiles, auto& ws_cols, auto& ws_row) { + using pika::execution::thread_priority; + + const std::size_t nthreads = [dist_sub, k_lc] { + const std::size_t workload = to_sizet(dist_sub.localSize().rows() * k_lc); + const std::size_t workload_unit = 2 * to_sizet(dist_sub.tile_size().linear_size()); + + const std::size_t min_workers = 1; + const std::size_t available_workers = getTridiagRank1NWorkers(); + + const std::size_t ideal_workers = util::ceilDiv(to_sizet(workload), workload_unit); + return std::clamp(ideal_workers, min_workers, available_workers); + }(); + + return ex::just(std::make_unique>(nthreads)) | ex::transfer(hp_scheduler) | + ex::bulk(nthreads, [&row_comm_wrapper, &col_comm_wrapper, k, k_lc, &rho, &d_tiles, + &z_tiles, &eval_tiles, &i4_tiles_arr, &i6_tiles_arr, &i2_tiles_arr, + &evec_tiles, &ws_cols, &ws_row, nthreads, n, dist_sub, bcast_evals, + all_reduce_in_place](const std::size_t thread_idx, + auto& barrier_ptr) { + using dlaf::comm::internal::transformMPI; + + common::Pipeline row_comm_chain(row_comm_wrapper.get()); + const dlaf::comm::Communicator& col_comm = col_comm_wrapper.get(); + + const SizeType m_lc = dist_sub.local_nr_tiles().rows(); + const SizeType m_el_lc = dist_sub.local_size().rows(); + const SizeType n_el_lc = dist_sub.local_size().cols(); + + const auto barrier_busy_wait = getTridiagRank1BarrierBusyWait(); + + const SizeType* i4 = i4_tiles_arr[0].get().ptr(); + const SizeType* i2 = i2_tiles_arr[0].get().ptr(); + const SizeType* i6 = i6_tiles_arr[0].get().ptr(); + + // STEP 0a: Fill ones for deflated Eigenvectors and copy related Eigenvalues (single-thread) + // Note: this step is completely independent from the rest, but it is small and it is going + // to be dropped soon. + // Note: use last threads that in principle should have less work to do + if (k < n && thread_idx == nthreads - 1) { + const T* eval_initial_ptr = d_tiles[0].get().ptr(); + T* eval_ptr = eval_tiles[0].ptr(); + + for (SizeType j_el_lc = k_lc; j_el_lc < n_el_lc; ++j_el_lc) { + const SizeType j_el = + dist_sub.global_element_from_local_element(j_el_lc); + const SizeType i_el = j_el; + + if (dist_sub.rank_index().row() == dist_sub.rank_global_element(i_el)) { + const SizeType i_el_lc = + dist_sub.local_element_from_global_element(i_el); + const LocalTileIndex + i_lc{dist_sub.local_tile_from_local_element(i_el_lc), + dist_sub.local_tile_from_local_element(j_el_lc)}; + const SizeType linear_lc = dist_extra::local_tile_linear_index(dist_sub, i_lc); + const TileElementIndex + ij_el_tl{dist_sub.tile_element_from_local_element(i_el_lc), + dist_sub.tile_element_from_local_element(j_el_lc)}; + + evec_tiles[to_sizet(linear_lc)](ij_el_tl) = T{1}; + } + + eval_ptr[j_el] = eval_initial_ptr[i6[j_el]]; + } + } + + const std::size_t batch_size = util::ceilDiv(to_sizet(k_lc), nthreads); + const SizeType begin = to_SizeType(thread_idx * batch_size); + const SizeType end = std::min(to_SizeType(thread_idx * batch_size + batch_size), k_lc); + + // STEP 0b: Initialize workspaces (single-thread) + if (thread_idx == 0) { + // Note: + // - nthreads are used for both LAED4 and weight calculation (one per worker thread) + // - last one is used for reducing weights from all workers + ws_cols.reserve(nthreads + 1); + + // Note: + // Considering that + // - LAED4 requires working on k elements + // - Weight computation requires working on m_el_lc + // + // and they are needed at two steps that cannot happen in parallel, we opted for allocating + // the workspace with the highest requirement of memory, and reuse them for both steps. + const SizeType max_size = std::max(k, m_el_lc); + for (std::size_t i = 0; i < nthreads; ++i) + ws_cols.emplace_back(max_size); + ws_cols.emplace_back(m_el_lc); + + ws_row = memory::MemoryView(n_el_lc); + std::fill_n(ws_row(), n_el_lc, 0); + } + + // Note: we have to wait that LAED4 workspaces are ready to be used + barrier_ptr->arrive_and_wait(barrier_busy_wait); + + const T* d_ptr = d_tiles[0].get().ptr(); + const T* z_ptr = z_tiles[0].ptr(); + + // STEP 1: LAED4 (multi-thread) + { + common::internal::SingleThreadedBlasScope single; + + T* eval_ptr = eval_tiles[0].ptr(); + T* delta_ptr = ws_cols[thread_idx](); + + for (SizeType j_el_lc = begin; j_el_lc < end; ++j_el_lc) { + const SizeType j_el = + dist_sub.global_element_from_local_element(j_el_lc); + const SizeType j_lc = dist_sub.local_tile_from_local_element(j_el_lc); + + // Solve the deflated rank-1 problem + // Note: + // Input eigenvalues are stored "deflated" with i3, but laed4 is going to store them + // "locally" deflated, i.e. locally it is valid sort(non-deflated)|sort(deflated) + const SizeType js_el = i6[j_el]; + T& eigenval = eval_ptr[to_sizet(j_el)]; + lapack::laed4(to_signed(k), to_signed(js_el), d_ptr, z_ptr, + delta_ptr, rho, &eigenval); + + // Now laed4 result has to be copied in the right spot + const SizeType j_el_tl = + dist_sub.tile_element_from_global_element(j_el); + + for (SizeType i_lc = 0; i_lc < m_lc; ++i_lc) { + const SizeType i = dist_sub.global_tile_from_local_tile(i_lc); + const SizeType m_el_tl = dist_sub.tile_size_of(i); + const SizeType linear_lc = + dist_extra::local_tile_linear_index(dist_sub, {i_lc, j_lc}); + const auto& evec = evec_tiles[to_sizet(linear_lc)]; + for (SizeType i_el_tl = 0; i_el_tl < m_el_tl; ++i_el_tl) { + const SizeType i_el = + dist_sub.global_element_from_local_tile_and_tile_element( + i_lc, i_el_tl); + DLAF_ASSERT_HEAVY(i_el < n, i_el, n); + const SizeType is_el = i4[i_el]; + + // just non-deflated, because deflated have been already set to 0 + if (is_el < k) + evec({i_el_tl, j_el_tl}) = delta_ptr[is_el]; + } + } + } + } + // Note: This barrier ensures that LAED4 finished, so from now on values are available + barrier_ptr->arrive_and_wait(barrier_busy_wait); + + // STEP 2: Broadcast evals + + // Note: this ensures that evals broadcasting finishes before bulk releases resources + ScopedSenderWait bcast_barrier; + if (thread_idx == 0) + bcast_barrier.sender_ = bcast_evals(row_comm_chain, eval_tiles); + + // Note: laed4 handles k <= 2 cases differently + if (k <= 2) + return; + + // STEP 2 Compute weights (multi-thread) + auto& q = evec_tiles; + T* w = ws_cols[thread_idx](); + + // STEP 2a: copy diagonal from q -> w (or just initialize with 1) + if (thread_idx == 0) { + for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) { + const SizeType i_el = + dist_sub.global_element_from_local_element(i_el_lc); + const SizeType is_el = i4[i_el]; + + if (is_el >= k) { + w[i_el_lc] = T{0}; + continue; + } + + const SizeType js_el = is_el; + const SizeType j_el = i2[js_el]; + + const GlobalElementIndex ij_subm_el(i_el, j_el); + + if (dist_sub.rank_index().col() == dist_sub.rank_global_element(j_el)) { + const SizeType linear_subm_lc = dist_extra::local_tile_linear_index( + dist_sub, {dist_sub.local_tile_from_local_element(i_el_lc), + dist_sub.local_tile_from_global_element(j_el)}); + const TileElementIndex ij_tl = dist_sub.tile_element_index(ij_subm_el); + w[i_el_lc] = q[to_sizet(linear_subm_lc)](ij_tl); + } + else { + w[i_el_lc] = T{1}; + } + } + } + else { // other workers + std::fill_n(w, m_el_lc, T(1)); + } + + barrier_ptr->arrive_and_wait(barrier_busy_wait); + + // STEP 2b: compute weights + { + for (SizeType j_el_lc = begin; j_el_lc < end; ++j_el_lc) { + const SizeType j_el = + dist_sub.global_element_from_local_element(j_el_lc); + const SizeType j_lc = dist_sub.local_tile_from_global_element(j_el); + const SizeType js_el = i6[j_el]; + const T delta_j = d_ptr[to_sizet(js_el)]; + + const SizeType j_el_tl = + dist_sub.tile_element_from_local_element(j_el_lc); + + for (SizeType i_lc = 0; i_lc < m_lc; ++i_lc) { + const SizeType i = dist_sub.global_tile_from_local_tile(i_lc); + const SizeType m_el_tl = dist_sub.tile_size_of(i); + const SizeType linear_lc = + dist_extra::local_tile_linear_index(dist_sub, {i_lc, j_lc}); + const auto& q_tile = q[to_sizet(linear_lc)]; + + for (SizeType i_el_tl = 0; i_el_tl < m_el_tl; ++i_el_tl) { + const SizeType i_el = + dist_sub.global_element_from_global_tile_and_tile_element( + i, i_el_tl); + DLAF_ASSERT_HEAVY(i_el < n, i_el, n); + const SizeType is_el = i4[i_el]; + + // skip if deflated + if (is_el >= k) + continue; + + // skip if originally it was on the diagonal + if (is_el == js_el) + continue; + + const SizeType i_el_lc = + dist_sub.local_element_from_local_tile_and_tile_element( + i_lc, i_el_tl); + const TileElementIndex ij_tl(i_el_tl, j_el_tl); + + w[i_el_lc] *= q_tile(ij_tl) / (d_ptr[to_sizet(is_el)] - delta_j); + } + } + } + } + + barrier_ptr->arrive_and_wait(barrier_busy_wait); + + // STEP 2c: reduce, then finalize computation with sign and square root (single-thread) + if (thread_idx == 0) { + // local reduction from all bulk workers + for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) { + for (std::size_t tidx = 1; tidx < nthreads; ++tidx) { + const T* w_partial = ws_cols[tidx](); + w[i_el_lc] *= w_partial[i_el_lc]; + } + } + + tt::sync_wait(ex::when_all(row_comm_chain(), + ex::just(MPI_PROD, common::make_data(w, m_el_lc))) | + transformMPI(all_reduce_in_place)); #ifdef DLAF_ASSERT_HEAVY_ENABLE - // Note: all input for weights computation of non-deflated rows should be strictly less than 0 - for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) { - const SizeType i_el = dist_sub.global_element_from_local_element(i_el_lc); - const SizeType is = i4[i_el]; - if (is < k) - DLAF_ASSERT_HEAVY(w[i_el_lc] < 0, w[i_el_lc]); - } + // Note: all input for weights computation of non-deflated rows should be strictly less than 0 + for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) { + const SizeType i_el = + dist_sub.global_element_from_local_element(i_el_lc); + const SizeType is = i4[i_el]; + if (is < k) + DLAF_ASSERT_HEAVY(w[i_el_lc] < 0, w[i_el_lc]); + } #endif - T* weights = ws_cols[nthreads](); - for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) { - const SizeType i_el = dist_sub.global_element_from_local_element(i_el_lc); - const SizeType is_el = i4[i_el]; - weights[to_sizet(i_el_lc)] = std::copysign(std::sqrt(-w[i_el_lc]), z_ptr[to_sizet(is_el)]); - } - } - - barrier_ptr->arrive_and_wait(barrier_busy_wait); - - // STEP 3: Compute eigenvectors of the modified rank-1 modification (normalize) (multi-thread) - - // STEP 3a: Form evecs using weights vector and compute (local) sum of squares - if (thread_idx == 0) { // TODO make it multithreaded again - common::internal::SingleThreadedBlasScope single; - - const T* w = ws_cols[nthreads](); - T* sum_squares = ws_row(); - - for (SizeType j_el_lc = 0; j_el_lc < k_lc; ++j_el_lc) { - const SizeType j_lc = dist_sub.local_tile_from_local_element(j_el_lc); - const SizeType j_el_tl = dist_sub.tile_element_from_local_element(j_el_lc); - - for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) { - const SizeType i_el = dist_sub.global_element_from_local_element(i_el_lc); - const SizeType is_el = i4[i_el]; - - // it is a deflated row, skip it (it should be already 0) - if (is_el >= k) - continue; - - const LocalTileIndex ij_lc(dist_sub.local_tile_from_local_element(i_el_lc), - j_lc); - const SizeType ij_linear = dist_extra::local_tile_linear_index(dist_sub, ij_lc); - const TileElementIndex ij_el_tl( - dist_sub.tile_element_from_local_element(i_el_lc), j_el_tl); - - const auto& q_tile = q[to_sizet(ij_linear)]; - - q_tile(ij_el_tl) = w[i_el_lc] / q_tile(ij_el_tl); - } - - // column-major once the full column has been updated, compute the sum of squares (for norm) - for (SizeType i_lc = 0; i_lc < dist_sub.local_nr_tiles().rows(); ++i_lc) { - const LocalTileIndex ij_lc(i_lc, j_lc); - const SizeType ij_linear = dist_extra::local_tile_linear_index(dist_sub, ij_lc); - const T* partial_evec = q[to_sizet(ij_linear)].ptr({0, j_el_tl}); - const SizeType i = dist_sub.global_tile_from_local_tile(i_lc); - const SizeType m_el_tl = dist_sub.tile_size_of(i); - sum_squares[j_el_lc] += blas::dot(m_el_tl, partial_evec, 1, partial_evec, 1); - } - } - } - - barrier_ptr->arrive_and_wait(barrier_busy_wait); - - // STEP 3b: Reduce to get the sum of all squares on all ranks - if (thread_idx == 0) - tt::sync_wait(ex::just(std::cref(col_comm), MPI_SUM, common::make_data(ws_row(), k_lc)) | - transformMPI(all_reduce_in_place)); - - barrier_ptr->arrive_and_wait(barrier_busy_wait); - - // STEP 3c: Normalize (compute norm of each column and scale column vector) - if (thread_idx == 0) { // TODO make it multithreaded again - common::internal::SingleThreadedBlasScope single; - - const T* sum_squares = ws_row(); - - for (SizeType j_el_lc = 0; j_el_lc < k_lc; ++j_el_lc) { - const SizeType j_lc = dist_sub.local_tile_from_local_element(j_el_lc); - const SizeType j_el_tl = dist_sub.tile_element_from_local_element(j_el_lc); - - const T vec_norm = std::sqrt(sum_squares[j_el_lc]); - - for (SizeType i_lc = 0; i_lc < dist_sub.local_nr_tiles().rows(); ++i_lc) { - const LocalTileIndex ij_lc(i_lc, j_lc); - const SizeType ij_linear = dist_extra::local_tile_linear_index(dist_sub, ij_lc); - - T* partial_evec = q[to_sizet(ij_linear)].ptr({0, j_el_tl}); - - const SizeType i = dist_sub.global_tile_from_local_tile(i_lc); - const SizeType m_el_tl = dist_sub.tile_size_of(i); - blas::scal(m_el_tl, 1 / vec_norm, partial_evec, 1); - } - } - } + T* weights = ws_cols[nthreads](); + for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) { + const SizeType i_el = + dist_sub.global_element_from_local_element(i_el_lc); + const SizeType is_el = i4[i_el]; + weights[to_sizet(i_el_lc)] = + std::copysign(std::sqrt(-w[i_el_lc]), z_ptr[to_sizet(is_el)]); + } + } + + barrier_ptr->arrive_and_wait(barrier_busy_wait); + + // STEP 3: Compute eigenvectors of the modified rank-1 modification (normalize) (multi-thread) + + // STEP 3a: Form evecs using weights vector and compute (local) sum of squares + { + common::internal::SingleThreadedBlasScope single; + + const T* w = ws_cols[nthreads](); + T* sum_squares = ws_row(); + + for (SizeType j_el_lc = begin; j_el_lc < end; ++j_el_lc) { + const SizeType j_lc = dist_sub.local_tile_from_local_element(j_el_lc); + const SizeType j_el_tl = + dist_sub.tile_element_from_local_element(j_el_lc); + + for (SizeType i_lc = 0; i_lc < m_lc; ++i_lc) { + const SizeType i = dist_sub.global_tile_from_local_tile(i_lc); + const SizeType m_el_tl = dist_sub.tile_size_of(i); + const SizeType linear_lc = + dist_extra::local_tile_linear_index(dist_sub, {i_lc, j_lc}); + const auto& q_tile = q[to_sizet(linear_lc)]; + + for (SizeType i_el_tl = 0; i_el_tl < m_el_tl; ++i_el_tl) { + const SizeType i_el = + dist_sub.global_element_from_global_tile_and_tile_element( + i, i_el_tl); + + DLAF_ASSERT_HEAVY(i_el < n, i_el, n); + const SizeType is_el = i4[i_el]; + + // it is a deflated row, skip it (it should be already 0) + if (is_el >= k) + continue; + + const SizeType i_el_lc = + dist_sub.local_element_from_local_tile_and_tile_element( + i_lc, i_el_tl); + const TileElementIndex ij_el_tl(i_el_tl, j_el_tl); + + q_tile(ij_el_tl) = w[i_el_lc] / q_tile(ij_el_tl); + } + + const T* partial_evec = q_tile.ptr({0, j_el_tl}); + sum_squares[j_el_lc] += blas::dot(m_el_tl, partial_evec, 1, partial_evec, 1); + } + } + } + + barrier_ptr->arrive_and_wait(barrier_busy_wait); + + // STEP 3b: Reduce to get the sum of all squares on all ranks + if (thread_idx == 0) + tt::sync_wait(ex::just(std::cref(col_comm), MPI_SUM, + common::make_data(ws_row(), k_lc)) | + transformMPI(all_reduce_in_place)); + + barrier_ptr->arrive_and_wait(barrier_busy_wait); + + // STEP 3c: Normalize (compute norm of each column and scale column vector) + { + common::internal::SingleThreadedBlasScope single; + + const T* sum_squares = ws_row(); + + for (SizeType j_el_lc = begin; j_el_lc < end; ++j_el_lc) { + const SizeType j_lc = dist_sub.local_tile_from_local_element(j_el_lc); + const SizeType j_el_tl = + dist_sub.tile_element_from_local_element(j_el_lc); + + const T vec_norm = std::sqrt(sum_squares[j_el_lc]); + + for (SizeType i_lc = 0; i_lc < m_lc; ++i_lc) { + const LocalTileIndex ij_lc(i_lc, j_lc); + const SizeType ij_linear = dist_extra::local_tile_linear_index(dist_sub, ij_lc); + + T* partial_evec = q[to_sizet(ij_linear)].ptr({0, j_el_tl}); + + const SizeType i = dist_sub.global_tile_from_local_tile(i_lc); + const SizeType m_el_tl = dist_sub.tile_size_of(i); + blas::scal(m_el_tl, 1 / vec_norm, partial_evec, 1); + } + } + } + }); })); } From 72b5210b21447bd4b61b19a54a4b3f6bf0f0ecef Mon Sep 17 00:00:00 2001 From: guglielmogagliardi <94535690+gulivarese@users.noreply.github.com> Date: Tue, 12 Dec 2023 13:45:52 +0100 Subject: [PATCH 2/7] add miniapp triangular multiplication (#1029) --- miniapp/CMakeLists.txt | 2 + miniapp/miniapp_triangular_multiplication.cpp | 288 ++++++++++++++++++ miniapp/miniapp_triangular_solver.cpp | 1 + scripts/miniapps.py | 35 +++ scripts/plot_trmm_strong.py | 20 ++ scripts/plot_trmm_weak.py | 20 ++ scripts/postprocess.py | 78 ++++- 7 files changed, 441 insertions(+), 3 deletions(-) create mode 100644 miniapp/miniapp_triangular_multiplication.cpp create mode 100755 scripts/plot_trmm_strong.py create mode 100755 scripts/plot_trmm_weak.py diff --git a/miniapp/CMakeLists.txt b/miniapp/CMakeLists.txt index 9d5b054928..8f3c145f15 100644 --- a/miniapp/CMakeLists.txt +++ b/miniapp/CMakeLists.txt @@ -39,6 +39,8 @@ DLAF_addMiniapp(miniapp_bt_reduction_to_band SOURCES miniapp_bt_reduction_to_ban DLAF_addMiniapp(miniapp_triangular_solver SOURCES miniapp_triangular_solver.cpp) +DLAF_addMiniapp(miniapp_triangular_multiplication SOURCES miniapp_triangular_multiplication.cpp) + DLAF_addMiniapp(miniapp_eigensolver SOURCES miniapp_eigensolver.cpp) DLAF_addMiniapp(miniapp_gen_eigensolver SOURCES miniapp_gen_eigensolver.cpp) diff --git a/miniapp/miniapp_triangular_multiplication.cpp b/miniapp/miniapp_triangular_multiplication.cpp new file mode 100644 index 0000000000..4cd0e52692 --- /dev/null +++ b/miniapp/miniapp_triangular_multiplication.cpp @@ -0,0 +1,288 @@ +// +// Distributed Linear Algebra with Future (DLAF) +// +// Copyright (c) 2018-2023, ETH Zurich +// All rights reserved. +// +// Please, refer to the LICENSE file in the root directory. +// SPDX-License-Identifier: BSD-3-Clause +// + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +using dlaf::Backend; +using dlaf::DefaultDevice_v; +using dlaf::Device; +using dlaf::GlobalElementIndex; +using dlaf::GlobalElementSize; +using dlaf::SizeType; +using dlaf::TileElementSize; +using dlaf::comm::Communicator; +using dlaf::comm::CommunicatorGrid; +using dlaf::common::Ordering; + +struct Options + : dlaf::miniapp::MiniappOptions { + SizeType m; + SizeType n; + SizeType mb; + SizeType nb; + blas::Side side; + blas::Uplo uplo; + blas::Op op; + blas::Diag diag; + + Options(const pika::program_options::variables_map& vm) + : MiniappOptions(vm), m(vm["m"].as()), n(vm["n"].as()), + mb(vm["mb"].as()), nb(vm["nb"].as()), + side(dlaf::miniapp::parseSide(vm["side"].as())), + uplo(dlaf::miniapp::parseUplo(vm["uplo"].as())), + op(dlaf::miniapp::parseOp(vm["op"].as())), + diag(dlaf::miniapp::parseDiag(vm["diag"].as())) { + DLAF_ASSERT(m > 0 && n > 0, m, n); + DLAF_ASSERT(mb > 0 && nb > 0, mb, nb); + + if (do_check != dlaf::miniapp::CheckIterFreq::None) { + std::cerr << "Warning! At the moment result checking it is not implemented." << std::endl; + do_check = dlaf::miniapp::CheckIterFreq::None; + } + } + + Options(Options&&) = default; + Options(const Options&) = default; + Options& operator=(Options&&) = default; + Options& operator=(const Options&) = default; +}; + +template +using matrix_values_t = std::function; + +template +using linear_system_t = + std::tuple, matrix_values_t, matrix_values_t>; // A, B, X + +template +linear_system_t sampleLeftTr(blas::Uplo uplo, blas::Op op, blas::Diag diag, T alpha, SizeType m); +} + +struct triangularMultiplicationMiniapp { + template + static void run(const Options& opts) { + Communicator world(MPI_COMM_WORLD); + CommunicatorGrid comm_grid(world, opts.grid_rows, opts.grid_cols, Ordering::ColumnMajor); + + // Allocate memory for the matrices + dlaf::matrix::Matrix ah(GlobalElementSize{opts.m, opts.m}, + TileElementSize{opts.mb, opts.mb}, comm_grid); + dlaf::matrix::Matrix bh(GlobalElementSize{opts.m, opts.n}, + TileElementSize{opts.mb, opts.nb}, comm_grid); + + dlaf::matrix::MatrixMirror, Device::CPU> a(ah); + dlaf::matrix::MatrixMirror, Device::CPU> b(bh); + + auto sync_barrier = [&]() { + a.get().waitLocalTiles(); + b.get().waitLocalTiles(); + DLAF_MPI_CHECK_ERROR(MPI_Barrier(world)); + }; + + const auto side = opts.side; + DLAF_ASSERT(side == blas::Side::Left, side); + + const auto uplo = opts.uplo; + const auto op = opts.op; + const auto diag = opts.diag; + const T alpha = 2.0; + + double m = ah.size().rows(); + double n = bh.size().cols(); + auto add_mul = n * m * m / 2; + const double total_ops = dlaf::total_ops(add_mul, add_mul); + + auto [in_op_a, out_b, in_b] = ::sampleLeftTr(uplo, op, diag, alpha, ah.size().rows()); + + for (int64_t run_index = -opts.nwarmups; run_index < opts.nruns; ++run_index) { + if (0 == world.rank() && run_index >= 0) + std::cout << "[" << run_index << "]" << std::endl; + + // setup matrix A and b + using dlaf::matrix::util::set; + set(ah, in_op_a, op); + set(bh, in_b); + a.copySourceToTarget(); + b.copySourceToTarget(); + + sync_barrier(); + + dlaf::common::Timer<> timeit; + if (opts.local) + dlaf::triangular_multiplication, T>(side, uplo, op, diag, + alpha, a.get(), + b.get()); + else + dlaf::triangular_multiplication, T>( + comm_grid, side, uplo, op, diag, alpha, a.get(), b.get()); + + sync_barrier(); + + // benchmark results + if (0 == world.rank() && run_index >= 0) { + auto elapsed_time = timeit.elapsed(); + double gigaflops = total_ops / elapsed_time / 1e9; + + std::cout << "[" << run_index << "]" + << " " << elapsed_time << "s" + << " " << gigaflops << "GFlop/s" + << " " << dlaf::internal::FormatShort{opts.type} + << dlaf::internal::FormatShort{opts.side} << dlaf::internal::FormatShort{opts.uplo} + << dlaf::internal::FormatShort{opts.op} << dlaf::internal::FormatShort{opts.diag} + << " " << bh.size() << " " << bh.blockSize() << " " << comm_grid.size() << " " + << pika::get_os_thread_count() << " " << backend << std::endl; + } + + b.copyTargetToSource(); + + // (optional) run test + if ((opts.do_check == dlaf::miniapp::CheckIterFreq::Last && run_index == (opts.nruns - 1)) || + opts.do_check == dlaf::miniapp::CheckIterFreq::All) { + DLAF_UNIMPLEMENTED("Check"); + } + } + } +}; + +int pika_main(pika::program_options::variables_map& vm) { + pika::scoped_finalize pika_finalizer; + dlaf::ScopedInitializer init(vm); + + const Options opts(vm); + dlaf::miniapp::dispatchMiniapp(opts); + + return EXIT_SUCCESS; +} + +int main(int argc, char** argv) { + dlaf::comm::mpi_init mpi_initter(argc, argv); + + // options + using namespace pika::program_options; + options_description desc_commandline( + "Benchmark computation of A . B, " + "where A is a non-unit lower triangular matrix, and B is an m by n matrix\n\n" + "options\n" + "Usage: miniapp_triangular_multiplication [options]"); + desc_commandline.add(dlaf::miniapp::getMiniappOptionsDescription()); + desc_commandline.add(dlaf::getOptionsDescription()); + + // clang-format off + desc_commandline.add_options() + ("m", value() ->default_value(4096), "Matrix b rows") + ("n", value() ->default_value(512), "Matrix b columns") + ("mb", value() ->default_value(256), "Matrix b block rows") + ("nb", value() ->default_value(512), "Matrix b block columns") + ; + // clang-format on + dlaf::miniapp::addSideOption(desc_commandline); + dlaf::miniapp::addUploOption(desc_commandline); + dlaf::miniapp::addOpOption(desc_commandline); + dlaf::miniapp::addDiagOption(desc_commandline); + + pika::init_params p; + p.desc_cmdline = desc_commandline; + p.rp_callback = dlaf::initResourcePartitionerHandler; + return pika::init(pika_main, argc, argv, p); +} + +namespace { +/// Returns a tuple of element generators of three matrices A(m x m), B (m x n), X (m x n), for which it +/// holds op(A) X = alpha B (alpha can be any value). +/// +/// The elements of op(A) (@p el_op_a) are chosen such that: +/// op(A)_ik = (i+1) / (k+.5) * exp(I*(2*i-k)) for the referenced elements +/// op(A)_ik = -9.9 otherwise, +/// where I = 0 for real types or I is the complex unit for complex types. +/// +/// The elements of X (@p el_x) are computed as +/// X_kj = (k+.5) / (j+2) * exp(I*(k+j)). +/// These data are typically used to check whether the result of the equation +/// performed with any algorithm is consistent with the computed values. +/// +/// Finally, the elements of B (@p el_b) should be: +/// B_ij = (Sum_k op(A)_ik * X_kj) / alpha +/// = (op(A)_ii * X_ij + (kk-1) * gamma) / alpha, +/// where gamma = (i+1) / (j+2) * exp(I*(2*i+j)), +/// kk = i+1 if op(a) is an lower triangular matrix, or +/// kk = m-i if op(a) is an lower triangular matrix. +/// Therefore +/// B_ij = (X_ij + (kk-1) * gamma) / alpha, if diag == Unit +/// B_ij = kk * gamma / alpha, otherwise. +template +linear_system_t sampleLeftTr(blas::Uplo uplo, blas::Op op, blas::Diag diag, T alpha, SizeType m) { + static_assert(std::is_arithmetic_v && !std::is_integral_v, + "it is valid just with floating point values"); + + bool op_a_lower = (uplo == blas::Uplo::Lower && op == blas::Op::NoTrans) || + (uplo == blas::Uplo::Upper && op != blas::Op::NoTrans); + + auto el_op_a = [op_a_lower, diag](const GlobalElementIndex& index) -> T { + if ((op_a_lower && index.row() < index.col()) || (!op_a_lower && index.row() > index.col()) || + (diag == blas::Diag::Unit && index.row() == index.col())) + return static_cast(-9.9); + + const T i = index.row(); + const T k = index.col(); + + return (i + static_cast(1)) / (k + static_cast(.5)); + }; + + auto el_x = [](const GlobalElementIndex& index) -> T { + const T k = index.row(); + const T j = index.col(); + + return (k + static_cast(.5)) / (j + static_cast(2)); + }; + + auto el_b = [m, alpha, diag, op_a_lower, el_x](const GlobalElementIndex& index) -> T { + const dlaf::BaseType kk = op_a_lower ? index.row() + 1 : m - index.row(); + + const T i = index.row(); + const T j = index.col(); + const T gamma = (i + 1) / (j + 2); + if (diag == blas::Diag::Unit) + return ((kk - 1) * gamma + el_x(index)) / alpha; + else + return kk * gamma / alpha; + }; + + return {el_op_a, el_b, el_x}; +} + +} diff --git a/miniapp/miniapp_triangular_solver.cpp b/miniapp/miniapp_triangular_solver.cpp index c5427dd748..9d3478e9ed 100644 --- a/miniapp/miniapp_triangular_solver.cpp +++ b/miniapp/miniapp_triangular_solver.cpp @@ -115,6 +115,7 @@ struct triangularSolverMiniapp { }; const auto side = opts.side; + DLAF_ASSERT(side == blas::Side::Left, side); const auto uplo = opts.uplo; const auto op = opts.op; const auto diag = opts.diag; diff --git a/scripts/miniapps.py b/scripts/miniapps.py index 8706eee9a1..cdc05a397f 100644 --- a/scripts/miniapps.py +++ b/scripts/miniapps.py @@ -278,6 +278,41 @@ def trsm( return cmd, env.strip() +def trmm( + system, + lib, + build_dir, + nodes, + rpn, + m_sz, + n_sz, + mb_sz, + nruns, + suffix="na", + extra_flags="", + env="", + dtype="d", +): + if n_sz == None: + n_sz = m_sz + + _check_ranks_per_node(system, lib, rpn) + [total_ranks, cores_per_rank, threads_per_rank] = _computeResourcesNeededList(system, nodes, rpn) + gr, gc = _sq_factor(total_ranks) + + if lib.startswith("dlaf"): + _check_type(dtype) + env += " OMP_NUM_THREADS=1" + app = f"{build_dir}/miniapp/miniapp_triangular_multiplication" + opts = f"--type {dtype} --m {m_sz} --n {n_sz} --mb {mb_sz} --nb {mb_sz} --grid-rows {gr} --grid-cols {gc} --nruns {nruns} {extra_flags}" + else: + raise ValueError(_err_msg(lib)) + + _checkAppExec(app) + cmd = f"{app} {opts}".strip() + f" >> trmm_{lib}_{suffix}.out 2>&1" + return cmd, env.strip() + + # lib: allowed libraries are dlaf|slate # rpn: ranks per node # diff --git a/scripts/plot_trmm_strong.py b/scripts/plot_trmm_strong.py new file mode 100755 index 0000000000..8df58b05a7 --- /dev/null +++ b/scripts/plot_trmm_strong.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +# +# Distributed Linear Algebra with Future (DLAF) +# +# Copyright (c) 2018-2023, ETH Zurich +# All rights reserved. +# +# Please, refer to the LICENSE file in the root directory. +# SPDX-License-Identifier: BSD-3-Clause +# + +import postprocess as pp + +df = pp.parse_jobs_cmdargs(description="Plot trmm strong scaling benchmarks.") + +df_grp = pp.calc_trmm_metrics(df) +pp.gen_trmm_plots_strong(df_grp) +pp.gen_trmm_plots_strong(df_grp, combine_mb=True) diff --git a/scripts/plot_trmm_weak.py b/scripts/plot_trmm_weak.py new file mode 100755 index 0000000000..0d44009e66 --- /dev/null +++ b/scripts/plot_trmm_weak.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +# +# Distributed Linear Algebra with Future (DLAF) +# +# Copyright (c) 2018-2023, ETH Zurich +# All rights reserved. +# +# Please, refer to the LICENSE file in the root directory. +# SPDX-License-Identifier: BSD-3-Clause +# + +import postprocess as pp + +df = pp.parse_jobs_cmdargs(description="Plot trmm weak scaling benchmarks.") + +df_grp = pp.calc_trmm_metrics(df) +pp.gen_trmm_plots_weak(df_grp, 1024, logx=True) +pp.gen_trmm_plots_weak(df_grp, 1024, logx=True, combine_mb=True) diff --git a/scripts/postprocess.py b/scripts/postprocess.py index 473faf55d7..4a5fda51f2 100644 --- a/scripts/postprocess.py +++ b/scripts/postprocess.py @@ -49,7 +49,7 @@ def _gen_nodes_plot( """ Args: plt_type: ppn | time - plt_routine: chol | hegst | red2band | band2trid | trid_evp | bt_band2trid | bt_red2band | trsm | evp | gevp It is used to filter data. + plt_routine: chol | hegst | red2band | band2trid | trid_evp | bt_band2trid | bt_red2band | trsm | trmm | evp | gevp It is used to filter data. title: title of the plot df: the pandas.DataFrame with the data for the plot combine_mb: bool indicates if different mb has to be included in the same plot @@ -240,7 +240,7 @@ def _parse_line_based(fout, bench_name, nodes): # fail. alg_name = bench_name[0 : bench_name.find("_dlaf")] - if alg_name in ["chol", "hegst", "trsm"]: + if alg_name in ["chol", "hegst", "trsm", "trmm"]: pstr_res = "[{run_index:d}] {time:g}s {perf:g}GFlop/s{matrix_type:optional_text} ({matrix_rows:d}, {matrix_cols:d}) ({block_rows:d}, {block_cols:d}) ({grid_rows:d}, {grid_cols:d}) {:d}{backend:optional_text}" if alg_name in ["red2band", "band2trid", "bt_band2trid", "bt_red2band"]: pstr_res = "[{run_index:d}] {time:g}s {perf:g}GFlop/s{matrix_type:optional_text} ({matrix_rows:d}, {matrix_cols:d}) ({block_rows:d}, {block_cols:d}) {band:d} ({grid_rows:d}, {grid_cols:d}) {:d}{backend:optional_text}" @@ -435,6 +435,10 @@ def calc_trsm_metrics(df): return _calc_metrics(["matrix_rows", "matrix_cols", "block_rows", "nodes", "bench_name"], df) +def calc_trmm_metrics(df): + return _calc_metrics(["matrix_rows", "matrix_cols", "block_rows", "nodes", "bench_name"], df) + + def calc_gen2std_metrics(df): return _calc_metrics(["matrix_rows", "block_rows", "nodes", "bench_name"], df) @@ -500,7 +504,7 @@ def _gen_plot( Args: scaling strong | weak name: name of the routine to be included in the title - routine: chol | hegst | red2band | band2trid | trid_evp | bt_band2trid | trsm | evp | gevp + routine: chol | hegst | red2band | band2trid | trid_evp | bt_band2trid | trsm | trmm | evp | gevp combine_mb: bool indicates if different mb has to be included in the same plot size_type: m | mn It indicates which sizes are relevant. customize_ppn: function accepting the two arguments fig and ax for ppn plot customization @@ -779,6 +783,74 @@ def gen_trsm_plots_weak( ) +def gen_trmm_plots_strong( + df, + logx=False, + combine_mb=False, + filename_suffix=None, + customize_ppn=add_basic_legend, + customize_time=add_basic_legend, + **proxy_args, +): + """ + Args: + customize_ppn: function accepting the two arguments fig and ax for ppn plot customization + customize_time: function accepting the two arguments fig and ax for time plot customization + Default customization (ppn and time): add_basic_legend. They can be set to "None" to remove the legend. + """ + _gen_plot( + scaling="strong", + name="TRMM", + routine="trmm", + filename="trmm", + size_type="mn", + df=df, + logx=logx, + combine_mb=combine_mb, + filename_suffix=filename_suffix, + ppn_plot=True, + customize_ppn=customize_ppn, + time_plot=True, + customize_time=customize_time, + **proxy_args, + ) + + +def gen_trmm_plots_weak( + df, + weak_rt_approx, + logx=False, + combine_mb=False, + filename_suffix=None, + customize_ppn=add_basic_legend, + customize_time=add_basic_legend, + **proxy_args, +): + """ + Args: + customize_ppn: function accepting the two arguments fig and ax for ppn plot customization + customize_time: function accepting the two arguments fig and ax for time plot customization + Default customization (ppn and time): add_basic_legend. They can be set to "None" to remove the legend. + """ + _gen_plot( + scaling="weak", + name="TRMM", + routine="trmm", + filename="trmm", + size_type="mn", + df=df, + logx=logx, + combine_mb=combine_mb, + filename_suffix=filename_suffix, + ppn_plot=True, + customize_ppn=customize_ppn, + time_plot=True, + customize_time=customize_time, + weak_rt_approx=weak_rt_approx, + **proxy_args, + ) + + def gen_gen2std_plots_strong( df, logx=None, From e0b8b9ee83112edd3ccdf19260ce9bb1db2b668c Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Mon, 25 Sep 2023 10:58:15 +0200 Subject: [PATCH 3/7] merge-squashed trisolver dist change to reduce gemm cost --- .../dlaf/eigensolver/tridiag_solver/merge.h | 143 +++++++++++++----- 1 file changed, 105 insertions(+), 38 deletions(-) diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h index e61c74f7c5..079aef8423 100644 --- a/include/dlaf/eigensolver/tridiag_solver/merge.h +++ b/include/dlaf/eigensolver/tridiag_solver/merge.h @@ -500,6 +500,35 @@ auto stablePartitionIndexForDeflationArrays(const matrix::Distribution& dist_sub index_sorted_coltype[to_sizet(jjj_el)] = jj_el; } + // TODO manage edge cases + std::array n_udl = [&]() { + SizeType first_dense; + for (first_dense = 0; first_dense < n; ++first_dense) { + const SizeType initial_el = index_sorted_coltype[to_sizet(first_dense)]; + const ColType coltype = types[to_sizet(initial_el)]; + if (ColType::UpperHalf != coltype) + break; + } + + SizeType last_dense; + for (last_dense = n - 1; last_dense >= 0; --last_dense) { + const SizeType initial_el = index_sorted_coltype[to_sizet(last_dense)]; + const ColType coltype = types[to_sizet(initial_el)]; + if (ColType::LowerHalf != coltype && ColType::Deflated != coltype) + break; + } + + SizeType last_lower; + for (last_lower = n - 1; last_lower >= 0; --last_lower) { + const SizeType initial_el = index_sorted_coltype[to_sizet(last_lower)]; + const ColType coltype = types[to_sizet(initial_el)]; + if (ColType::Deflated != coltype) + break; + } + + return std::array{first_dense, last_dense + 1, last_lower + 1}; + }(); + // invert i3 and store it in i2 (temporary) // i3 (in) : initial <--- deflated // i2 (out) : deflated <--- initial @@ -535,7 +564,7 @@ auto stablePartitionIndexForDeflationArrays(const matrix::Distribution& dist_sub for (SizeType i = 0; i < n; ++i) i2[i6[i]] = i; - return std::tuple(k, k_lc); + return std::tuple(k, k_lc, n_udl); } template @@ -1308,33 +1337,14 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S const SizeType* i2 = i2_tiles_arr[0].get().ptr(); const SizeType* i6 = i6_tiles_arr[0].get().ptr(); - // STEP 0a: Fill ones for deflated Eigenvectors and copy related Eigenvalues (single-thread) - // Note: this step is completely independent from the rest, but it is small and it is going - // to be dropped soon. + // STEP 0a: Permute eigenvalues for deflated eigenvectors (single-thread) // Note: use last threads that in principle should have less work to do if (k < n && thread_idx == nthreads - 1) { const T* eval_initial_ptr = d_tiles[0].get().ptr(); T* eval_ptr = eval_tiles[0].ptr(); for (SizeType j_el_lc = k_lc; j_el_lc < n_el_lc; ++j_el_lc) { - const SizeType j_el = - dist_sub.global_element_from_local_element(j_el_lc); - const SizeType i_el = j_el; - - if (dist_sub.rank_index().row() == dist_sub.rank_global_element(i_el)) { - const SizeType i_el_lc = - dist_sub.local_element_from_global_element(i_el); - const LocalTileIndex - i_lc{dist_sub.local_tile_from_local_element(i_el_lc), - dist_sub.local_tile_from_local_element(j_el_lc)}; - const SizeType linear_lc = dist_extra::local_tile_linear_index(dist_sub, i_lc); - const TileElementIndex - ij_el_tl{dist_sub.tile_element_from_local_element(i_el_lc), - dist_sub.tile_element_from_local_element(j_el_lc)}; - - evec_tiles[to_sizet(linear_lc)](ij_el_tl) = T{1}; - } - + const SizeType j_el = dist_sub.globalElementFromLocalElement(j_el_lc); eval_ptr[j_el] = eval_initial_ptr[i6[j_el]]; } } @@ -1650,19 +1660,31 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid, WorkSpace& ws, WorkSpaceHost& ws_h, DistWorkSpaceHostMirror& ws_hm) { namespace ex = pika::execution::experimental; + using matrix::internal::distribution::global_tile_element_distance; using pika::execution::thread_priority; - const matrix::Distribution& dist_evecs = ws.e0.distribution(); + const matrix::Distribution& dist = ws.e0.distribution(); + + const GlobalElementIndex sub_offset{i_begin * dist.blockSize().rows(), + i_begin * dist.blockSize().cols()}; + const matrix::Distribution dist_sub( + dist, {sub_offset, + { + global_tile_element_distance(dist, i_begin, i_end), + global_tile_element_distance(dist, i_begin, i_end), + }}); // Calculate the size of the upper subproblem - const SizeType n1 = dist_evecs.globalTileElementDistance(i_begin, i_split); + const SizeType n = dist.globalTileElementDistance(i_begin, i_end); + const SizeType n_upper = dist.globalTileElementDistance(i_begin, i_split); + const SizeType n_lower = dist.globalTileElementDistance(i_split, i_end); // The local size of the subproblem const GlobalTileIndex idx_gl_begin(i_begin, i_begin); - const LocalTileIndex idx_loc_begin{dist_evecs.nextLocalTileFromGlobalTile(i_begin), - dist_evecs.nextLocalTileFromGlobalTile(i_begin)}; - const LocalTileIndex idx_loc_end{dist_evecs.nextLocalTileFromGlobalTile(i_end), - dist_evecs.nextLocalTileFromGlobalTile(i_end)}; + const LocalTileIndex idx_loc_begin{dist.nextLocalTileFromGlobalTile(i_begin), + dist.nextLocalTileFromGlobalTile(i_begin)}; + const LocalTileIndex idx_loc_end{dist.nextLocalTileFromGlobalTile(i_end), + dist.nextLocalTileFromGlobalTile(i_end)}; const LocalTileSize sz_loc_tiles = idx_loc_end - idx_loc_begin; const LocalTileIndex idx_begin_tiles_vec(i_begin, 0); const LocalTileSize sz_tiles_vec(i_end - i_begin, 1); @@ -1689,7 +1711,7 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid, } // Update indices of second sub-problem - addIndex(i_split, i_end, n1, ws_h.i1); + addIndex(i_split, i_end, n_upper, ws_h.i1); // Step #1 // @@ -1699,7 +1721,7 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid, // - deflate `d`, `z` and `c` // - apply Givens rotations to `Q` - `evecs` // - sortIndex(i_begin, i_end, ex::just(n1), ws_h.d0, ws_h.i1, ws_hm.i2); + sortIndex(i_begin, i_end, ex::just(n_upper), ws_h.d0, ws_h.i1, ws_hm.i2); auto rots = applyDeflation(i_begin, i_end, scaled_rho, std::move(tol), ws_hm.i2, ws_h.d0, ws_hm.z0, ws_h.c); @@ -1725,11 +1747,12 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid, // - reorder `d0 -> d1`, `z0 -> z1`, using `i3` such that deflated entries are at the bottom. // - solve the rank-1 problem and save eigenvalues in `d0` and `d1` (copy) and eigenvectors in `e2`. // - set deflated diagonal entries of `U` to 1 (temporary solution until optimized GEMM is implemented) - auto [k_unique, k_lc] = - ex::split_tuple(stablePartitionIndexForDeflation(dist_evecs, i_begin, i_end, ws_h.c, ws_h.d0, - ws_hm.i2, ws_h.i3, ws_hm.i5, ws_h.i4, ws_hm.i6)); + auto [k_unique, k_lc_unique, n_udl] = + ex::split_tuple(stablePartitionIndexForDeflation(dist, i_begin, i_end, ws_h.c, ws_h.d0, ws_hm.i2, + ws_h.i3, ws_hm.i5, ws_h.i4, ws_hm.i6)); auto k = ex::split(std::move(k_unique)); + auto k_lc = ex::split(std::move(k_lc_unique)); // Reorder Eigenvectors using dlaf::permutations::internal::permuteJustLocal; @@ -1749,18 +1772,62 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid, // Note: here ws_hm.z0 is used as a contiguous buffer for the laed4 call matrix::util::set0(thread_priority::normal, idx_loc_begin, sz_loc_tiles, ws_hm.e2); - solveRank1ProblemDist(row_task_chain(), col_task_chain(), i_begin, i_end, k, std::move(k_lc), + solveRank1ProblemDist(row_task_chain(), col_task_chain(), i_begin, i_end, k, k_lc, std::move(scaled_rho), ws_hm.d1, ws_hm.z1, ws_h.d0, ws_h.i4, ws_hm.i6, ws_hm.i2, ws_hm.e2); + copy(idx_loc_begin, sz_loc_tiles, ws_hm.e2, ws.e2); // Step #3: Eigenvectors of the tridiagonal system: Q * U // // The eigenvectors resulting from the multiplication are already in the order of the eigenvalues as // prepared for the deflated system. - copy(idx_loc_begin, sz_loc_tiles, ws_hm.e2, ws.e2); - dlaf::multiplication::internal::generalSubMatrix(grid, row_task_chain, col_task_chain, - i_begin, i_end, T(1), ws.e1, ws.e2, T(0), - ws.e0); + ex::start_detached( + ex::when_all(std::move(k_lc), std::move(n_udl), row_task_chain(), col_task_chain()) | + ex::transfer(dlaf::internal::getBackendScheduler()) | + ex::then([dist_sub, sub_offset, n, n_upper, n_lower, e0 = ws.e0.subPipeline(), + e1 = ws.e1.subPipelineConst(), e2 = ws.e2.subPipelineConst()]( + const SizeType k_lc, const std::array& n_udl, auto&& row_comm_wrapper, + auto&& col_comm_wrapper) mutable { + using dlaf::matrix::internal::MatrixRef; + + common::Pipeline sub_comm_row(row_comm_wrapper.get()); + common::Pipeline sub_comm_col(col_comm_wrapper.get()); + + const auto [a, b, c] = n_udl; + + using GEMM = dlaf::multiplication::internal::General; + { + MatrixRef e1_sub(e1, {sub_offset, {n_upper, b}}); + MatrixRef e2_sub(e2, {sub_offset, {b, c}}); + MatrixRef e0_sub(e0, {sub_offset, {n_upper, c}}); + + GEMM::callNN(sub_comm_row, sub_comm_col, T(1), e1_sub, e2_sub, T(0), e0_sub); + } + + { + MatrixRef e1_sub(e1, {{sub_offset.row() + n_upper, sub_offset.col() + a}, + {n_lower, c - a}}); + MatrixRef e2_sub(e2, {{sub_offset.row() + a, sub_offset.col()}, {c - a, c}}); + MatrixRef e0_sub(e0, {{sub_offset.row() + n_upper, sub_offset.col()}, {n_lower, c}}); + + GEMM::callNN(sub_comm_row, sub_comm_col, T(1), e1_sub, e2_sub, T(0), e0_sub); + } + + // copy deflated from e1 to e0 + if (k_lc < dist_sub.localSize().cols()) { + const SizeType k = dist_sub.globalElementFromLocalElement(k_lc); + const matrix::internal::SubMatrixSpec deflated_submat{{sub_offset.row(), sub_offset.col() + k}, + {n, n - k}}; + MatrixRef sub_e0(e0, deflated_submat); + MatrixRef sub_e1(e1, deflated_submat); + + copy(sub_e1, sub_e0); + } + + namespace tt = pika::this_thread::experimental; + tt::sync_wait(sub_comm_row()); + tt::sync_wait(sub_comm_col()); + })); // Step #4: Final permutation to sort eigenvalues and eigenvectors // From 4f3150742468f23a9b9e50bb210a323fa312100d Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 12 Dec 2023 12:13:59 +0100 Subject: [PATCH 4/7] start factoring out gemm --- .../dlaf/eigensolver/tridiag_solver/merge.h | 107 ++++++++++-------- 1 file changed, 60 insertions(+), 47 deletions(-) diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h index 079aef8423..20601ac871 100644 --- a/include/dlaf/eigensolver/tridiag_solver/merge.h +++ b/include/dlaf/eigensolver/tridiag_solver/merge.h @@ -1650,6 +1650,64 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S })); } +template +void multiplyEigenvectors(const matrix::Distribution& dist_sub, + common::Pipeline& row_task_chain, + common::Pipeline& col_task_chain, + const GlobalElementIndex sub_offset, const SizeType n, const SizeType n_upper, + const SizeType n_lower, Matrix& e0, Matrix& e1, Matrix& e2, + KLcSender&& k_lc, UDLSenders&& n_udl) { + namespace ex = pika::execution::experimental; + + ex::start_detached( + ex::when_all(std::forward(k_lc), std::forward(n_udl), row_task_chain(), + col_task_chain()) | + ex::transfer(dlaf::internal::getBackendScheduler()) | + ex::then([dist_sub, sub_offset, n, n_upper, n_lower, e0 = e0.subPipeline(), + e1 = e1.subPipelineConst(), + e2 = e2.subPipelineConst()](const SizeType k_lc, const std::array& n_udl, + auto&& row_comm_wrapper, auto&& col_comm_wrapper) mutable { + using dlaf::matrix::internal::MatrixRef; + + common::Pipeline sub_comm_row(row_comm_wrapper.get()); + common::Pipeline sub_comm_col(col_comm_wrapper.get()); + + const auto [a, b, c] = n_udl; + + using GEMM = dlaf::multiplication::internal::General; + { + MatrixRef e1_sub(e1, {sub_offset, {n_upper, b}}); + MatrixRef e2_sub(e2, {sub_offset, {b, c}}); + MatrixRef e0_sub(e0, {sub_offset, {n_upper, c}}); + + GEMM::callNN(sub_comm_row, sub_comm_col, T(1), e1_sub, e2_sub, T(0), e0_sub); + } + + { + MatrixRef e1_sub(e1, {{sub_offset.row() + n_upper, sub_offset.col() + a}, + {n_lower, c - a}}); + MatrixRef e2_sub(e2, {{sub_offset.row() + a, sub_offset.col()}, {c - a, c}}); + MatrixRef e0_sub(e0, {{sub_offset.row() + n_upper, sub_offset.col()}, {n_lower, c}}); + + GEMM::callNN(sub_comm_row, sub_comm_col, T(1), e1_sub, e2_sub, T(0), e0_sub); + } + + if (k_lc < dist_sub.localSize().cols()) { + const SizeType k = dist_sub.globalElementFromLocalElement(k_lc); + const matrix::internal::SubMatrixSpec deflated_submat{{sub_offset.row(), sub_offset.col() + k}, + {n, n - k}}; + MatrixRef sub_e0(e0, deflated_submat); + MatrixRef sub_e1(e1, deflated_submat); + + copy(sub_e1, sub_e0); + } + + namespace tt = pika::this_thread::experimental; + tt::sync_wait(sub_comm_row()); + tt::sync_wait(sub_comm_col()); + })); +} + // Distributed version of the tridiagonal solver on CPUs template void mergeDistSubproblems(comm::CommunicatorGrid grid, @@ -1781,53 +1839,8 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid, // // The eigenvectors resulting from the multiplication are already in the order of the eigenvalues as // prepared for the deflated system. - ex::start_detached( - ex::when_all(std::move(k_lc), std::move(n_udl), row_task_chain(), col_task_chain()) | - ex::transfer(dlaf::internal::getBackendScheduler()) | - ex::then([dist_sub, sub_offset, n, n_upper, n_lower, e0 = ws.e0.subPipeline(), - e1 = ws.e1.subPipelineConst(), e2 = ws.e2.subPipelineConst()]( - const SizeType k_lc, const std::array& n_udl, auto&& row_comm_wrapper, - auto&& col_comm_wrapper) mutable { - using dlaf::matrix::internal::MatrixRef; - - common::Pipeline sub_comm_row(row_comm_wrapper.get()); - common::Pipeline sub_comm_col(col_comm_wrapper.get()); - - const auto [a, b, c] = n_udl; - - using GEMM = dlaf::multiplication::internal::General; - { - MatrixRef e1_sub(e1, {sub_offset, {n_upper, b}}); - MatrixRef e2_sub(e2, {sub_offset, {b, c}}); - MatrixRef e0_sub(e0, {sub_offset, {n_upper, c}}); - - GEMM::callNN(sub_comm_row, sub_comm_col, T(1), e1_sub, e2_sub, T(0), e0_sub); - } - - { - MatrixRef e1_sub(e1, {{sub_offset.row() + n_upper, sub_offset.col() + a}, - {n_lower, c - a}}); - MatrixRef e2_sub(e2, {{sub_offset.row() + a, sub_offset.col()}, {c - a, c}}); - MatrixRef e0_sub(e0, {{sub_offset.row() + n_upper, sub_offset.col()}, {n_lower, c}}); - - GEMM::callNN(sub_comm_row, sub_comm_col, T(1), e1_sub, e2_sub, T(0), e0_sub); - } - - // copy deflated from e1 to e0 - if (k_lc < dist_sub.localSize().cols()) { - const SizeType k = dist_sub.globalElementFromLocalElement(k_lc); - const matrix::internal::SubMatrixSpec deflated_submat{{sub_offset.row(), sub_offset.col() + k}, - {n, n - k}}; - MatrixRef sub_e0(e0, deflated_submat); - MatrixRef sub_e1(e1, deflated_submat); - - copy(sub_e1, sub_e0); - } - - namespace tt = pika::this_thread::experimental; - tt::sync_wait(sub_comm_row()); - tt::sync_wait(sub_comm_col()); - })); + multiplyEigenvectors(dist_sub, row_task_chain, col_task_chain, sub_offset, n, n_upper, n_lower, + ws.e0, ws.e1, ws.e2, std::move(k_lc), std::move(n_udl)); // Step #4: Final permutation to sort eigenvalues and eigenvectors // From a824d404bcc43a67f9cba99ce3b8ec45195c88e1 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 12 Dec 2023 16:33:18 +0100 Subject: [PATCH 5/7] doc + minor changes --- .../dlaf/eigensolver/tridiag_solver/merge.h | 88 +++++++++++++++++-- 1 file changed, 79 insertions(+), 9 deletions(-) diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h index 20601ac871..d7a3566f6c 100644 --- a/include/dlaf/eigensolver/tridiag_solver/merge.h +++ b/include/dlaf/eigensolver/tridiag_solver/merge.h @@ -412,6 +412,7 @@ auto stablePartitionIndexForDeflationArrays(const SizeType n, const ColType* typ // // @return k number of non-deflated eigenvectors // @return k_local number of local non-deflated eigenvectors +// @return n_udl tuple with global indices for [first_dense, last_dense, last_lower] template auto stablePartitionIndexForDeflationArrays(const matrix::Distribution& dist_sub, const ColType* types, const T* evals, SizeType* perm_sorted, @@ -500,7 +501,6 @@ auto stablePartitionIndexForDeflationArrays(const matrix::Distribution& dist_sub index_sorted_coltype[to_sizet(jjj_el)] = jj_el; } - // TODO manage edge cases std::array n_udl = [&]() { SizeType first_dense; for (first_dense = 0; first_dense < n; ++first_dense) { @@ -510,14 +510,11 @@ auto stablePartitionIndexForDeflationArrays(const matrix::Distribution& dist_sub break; } - SizeType last_dense; - for (last_dense = n - 1; last_dense >= 0; --last_dense) { - const SizeType initial_el = index_sorted_coltype[to_sizet(last_dense)]; - const ColType coltype = types[to_sizet(initial_el)]; - if (ColType::LowerHalf != coltype && ColType::Deflated != coltype) - break; - } - + // Note: + // Eigenvectors will be sorted according index_sorted_coltype, i.e. local sort by coltype. + // Since it is a local order, it is legit if deflated are globally interlaced with other column + // types. However, GEMM will be able to skip just the last global contiguous group of deflated + // eigenvectors, but not the ones interlaced with others. SizeType last_lower; for (last_lower = n - 1; last_lower >= 0; --last_lower) { const SizeType initial_el = index_sorted_coltype[to_sizet(last_lower)]; @@ -526,6 +523,14 @@ auto stablePartitionIndexForDeflationArrays(const matrix::Distribution& dist_sub break; } + SizeType last_dense; + for (last_dense = last_lower; last_dense >= 0; --last_dense) { + const SizeType initial_el = index_sorted_coltype[to_sizet(last_dense)]; + const ColType coltype = types[to_sizet(initial_el)]; + if (ColType::LowerHalf != coltype && ColType::Deflated != coltype) + break; + } + return std::array{first_dense, last_dense + 1, last_lower + 1}; }(); @@ -1657,6 +1662,71 @@ void multiplyEigenvectors(const matrix::Distribution& dist_sub, const GlobalElementIndex sub_offset, const SizeType n, const SizeType n_upper, const SizeType n_lower, Matrix& e0, Matrix& e1, Matrix& e2, KLcSender&& k_lc, UDLSenders&& n_udl) { + // Note: + // This function computes E0 = E1 . E2 + // + // where E1 is the matrix with eigenvectors and it looks like this + // + // ┌──────────┐ k + // │ b │ │ + // ▼ + // ┌── ┌───┬──────┬─┬────┐ + // │ │UUU│DDDDDD│ │XXXX│ + // │ │UUU│DDDDDD│ │XXXX│ + // n_upper │ │UUU│DDDDDD│ │XXXX│ + // │ │UUU│DDDDDD│ │XXXX│ + // │ │UUU│DDDDDD│ │XXXX│ + // ├── ├───┼──────┼─┤XXXX│ + // │ │ │DDDDDD│L│XXXX│ + // n_lower │ │ │DDDDDD│L│XXXX│ + // │ │ │DDDDDD│L│XXXX│ + // └── └───┴──────┴─┴────┘ + // │ a │ + // └───┘ + // │ c │ + // └────────────┘ + // + // Where (a, b, c) are the values from n_udl + // + // Note: + // E1 matrix does not have all deflated values at the end, indeed part of them are "interlaced" with + // others. The GEMM will perform anyway a computation for deflated eigenvectors (which are zeroed out) + // while the copy step will be performed at "local" level, so even interlaced ones will get copied + // in the right spot. + // + // The multiplication in two different steps in order to skip zero blocks of the matrix, created by + // the grouping of eigenvectors of different lengths (UPPER, DENSE and LOWER). + // + // 1. GEMM1 = TL . TOP + // 2. GEMM2 = BR . BOTTOM + // 3. copy DEFLATED + // + // ┌────────────┬────┐ + // │ │ │ + // │ │ │ + // │ T O P │ │ + // │ │ │ + // │ │ │ + // ├────────────┤ │ + // │ │ │ + // │ │ │ + // │B O T T O M │ │ + // │ │ │ + // └────────────┴────┘ + // + // ┌──────────┬─┬────┐ ┌────────────┬────┐ + // │ │0│ │ │ │ │ + // │ │0│ D │ │ │ │ + // │ TL │0│ E │ │ GEMM 1 │ C │ + // │ │0│ F │ │ │ │ + // │ │0│ L │ │ │ O │ + // ├───┬──────┴─┤ A │ ├────────────┤ │ + // │000│ │ T │ │ │ P │ + // │000│ │ E │ │ │ │ + // │000│ BR │ D │ │ GEMM 2 │ Y │ + // │000│ │ │ │ │ │ + // └───┴────────┴────┘ └────────────┴────┘ + namespace ex = pika::execution::experimental; ex::start_detached( From c42260279613a70611dc1fc44c785b5f2908fad7 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 12 Dec 2023 16:54:05 +0100 Subject: [PATCH 6/7] make gemm scheduler hp + fix doc + minor changes --- .../dlaf/eigensolver/tridiag_solver/merge.h | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h index d7a3566f6c..be8cdc81b3 100644 --- a/include/dlaf/eigensolver/tridiag_solver/merge.h +++ b/include/dlaf/eigensolver/tridiag_solver/merge.h @@ -1656,10 +1656,9 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S } template -void multiplyEigenvectors(const matrix::Distribution& dist_sub, +void multiplyEigenvectors(const GlobalElementIndex sub_offset, const matrix::Distribution& dist_sub, common::Pipeline& row_task_chain, - common::Pipeline& col_task_chain, - const GlobalElementIndex sub_offset, const SizeType n, const SizeType n_upper, + common::Pipeline& col_task_chain, const SizeType n_upper, const SizeType n_lower, Matrix& e0, Matrix& e1, Matrix& e2, KLcSender&& k_lc, UDLSenders&& n_udl) { // Note: @@ -1728,12 +1727,13 @@ void multiplyEigenvectors(const matrix::Distribution& dist_sub, // └───┴────────┴────┘ └────────────┴────┘ namespace ex = pika::execution::experimental; + using pika::execution::thread_priority; ex::start_detached( ex::when_all(std::forward(k_lc), std::forward(n_udl), row_task_chain(), col_task_chain()) | - ex::transfer(dlaf::internal::getBackendScheduler()) | - ex::then([dist_sub, sub_offset, n, n_upper, n_lower, e0 = e0.subPipeline(), + ex::transfer(dlaf::internal::getBackendScheduler(thread_priority::high)) | + ex::then([dist_sub, sub_offset, n_upper, n_lower, e0 = e0.subPipeline(), e1 = e1.subPipelineConst(), e2 = e2.subPipelineConst()](const SizeType k_lc, const std::array& n_udl, auto&& row_comm_wrapper, auto&& col_comm_wrapper) mutable { @@ -1742,6 +1742,7 @@ void multiplyEigenvectors(const matrix::Distribution& dist_sub, common::Pipeline sub_comm_row(row_comm_wrapper.get()); common::Pipeline sub_comm_col(col_comm_wrapper.get()); + const SizeType n = dist_sub.size().cols(); const auto [a, b, c] = n_udl; using GEMM = dlaf::multiplication::internal::General; @@ -1793,8 +1794,8 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid, const matrix::Distribution& dist = ws.e0.distribution(); - const GlobalElementIndex sub_offset{i_begin * dist.blockSize().rows(), - i_begin * dist.blockSize().cols()}; + const GlobalElementIndex sub_offset{i_begin * dist.tile_size().rows(), + i_begin * dist.tile_size().cols()}; const matrix::Distribution dist_sub( dist, {sub_offset, { @@ -1898,7 +1899,9 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid, applyIndex(i_begin, i_end, ws_h.i3, ws_h.d0, ws_hm.d1); applyIndex(i_begin, i_end, ws_h.i3, ws_hm.z0, ws_hm.z1); - // Note: here ws_hm.z0 is used as a contiguous buffer for the laed4 call + // Note: + // set0 is required because deflated eigenvectors rows won't be touched in rank1 and so they will be + // neutral when used in GEMM (copy will take care of them later) matrix::util::set0(thread_priority::normal, idx_loc_begin, sz_loc_tiles, ws_hm.e2); solveRank1ProblemDist(row_task_chain(), col_task_chain(), i_begin, i_end, k, k_lc, std::move(scaled_rho), ws_hm.d1, ws_hm.z1, ws_h.d0, ws_h.i4, ws_hm.i6, ws_hm.i2, @@ -1909,8 +1912,8 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid, // // The eigenvectors resulting from the multiplication are already in the order of the eigenvalues as // prepared for the deflated system. - multiplyEigenvectors(dist_sub, row_task_chain, col_task_chain, sub_offset, n, n_upper, n_lower, - ws.e0, ws.e1, ws.e2, std::move(k_lc), std::move(n_udl)); + multiplyEigenvectors(sub_offset, dist_sub, row_task_chain, col_task_chain, n_upper, n_lower, ws.e0, + ws.e1, ws.e2, std::move(k_lc), std::move(n_udl)); // Step #4: Final permutation to sort eigenvalues and eigenvectors // From 4eb08c66efcde84fe5db701109a683c9c6e8b172 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 12 Dec 2023 17:03:28 +0100 Subject: [PATCH 7/7] snake case --- include/dlaf/eigensolver/tridiag_solver/merge.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h index be8cdc81b3..0a535c147a 100644 --- a/include/dlaf/eigensolver/tridiag_solver/merge.h +++ b/include/dlaf/eigensolver/tridiag_solver/merge.h @@ -1763,8 +1763,8 @@ void multiplyEigenvectors(const GlobalElementIndex sub_offset, const matrix::Dis GEMM::callNN(sub_comm_row, sub_comm_col, T(1), e1_sub, e2_sub, T(0), e0_sub); } - if (k_lc < dist_sub.localSize().cols()) { - const SizeType k = dist_sub.globalElementFromLocalElement(k_lc); + if (k_lc < dist_sub.local_size().cols()) { + const SizeType k = dist_sub.global_element_from_local_element(k_lc); const matrix::internal::SubMatrixSpec deflated_submat{{sub_offset.row(), sub_offset.col() + k}, {n, n - k}}; MatrixRef sub_e0(e0, deflated_submat); @@ -1804,16 +1804,15 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid, }}); // Calculate the size of the upper subproblem - const SizeType n = dist.globalTileElementDistance(i_begin, i_end); - const SizeType n_upper = dist.globalTileElementDistance(i_begin, i_split); - const SizeType n_lower = dist.globalTileElementDistance(i_split, i_end); + const SizeType n_upper = global_tile_element_distance(dist, i_begin, i_split); + const SizeType n_lower = global_tile_element_distance(dist, i_split, i_end); // The local size of the subproblem const GlobalTileIndex idx_gl_begin(i_begin, i_begin); - const LocalTileIndex idx_loc_begin{dist.nextLocalTileFromGlobalTile(i_begin), - dist.nextLocalTileFromGlobalTile(i_begin)}; - const LocalTileIndex idx_loc_end{dist.nextLocalTileFromGlobalTile(i_end), - dist.nextLocalTileFromGlobalTile(i_end)}; + const LocalTileIndex idx_loc_begin{dist.next_local_tile_from_global_tile(i_begin), + dist.next_local_tile_from_global_tile(i_begin)}; + const LocalTileIndex idx_loc_end{dist.next_local_tile_from_global_tile(i_end), + dist.next_local_tile_from_global_tile(i_end)}; const LocalTileSize sz_loc_tiles = idx_loc_end - idx_loc_begin; const LocalTileIndex idx_begin_tiles_vec(i_begin, 0); const LocalTileSize sz_tiles_vec(i_end - i_begin, 1);