From 2425d8643275b51dbf6282d6854d9f03c061cd55 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <9337627+albestro@users.noreply.github.com>
Date: Tue, 12 Dec 2023 13:42:18 +0100
Subject: [PATCH 1/7] TridiagSolver (dist): STEP2b Make rank1 work just on
 local non-deflated eigenvectors (multi-threaded) (#997)

---
 .../dlaf/eigensolver/tridiag_solver/merge.h   | 684 ++++++++++--------
 1 file changed, 367 insertions(+), 317 deletions(-)
diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h
index 4fd051b36a..e61c74f7c5 100644
--- a/include/dlaf/eigensolver/tridiag_solver/merge.h
+++ b/include/dlaf/eigensolver/tridiag_solver/merge.h
@@ -1257,17 +1257,9 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S
                                         comm, req));
   };
 
-  // Note: at least two column of tiles per-worker, in the range [1, getTridiagRank1NWorkers()]
-  const std::size_t nthreads = [nrtiles = dist_sub.local_nr_tiles().cols()]() {
-    const std::size_t min_workers = 1;
-    const std::size_t available_workers = getTridiagRank1NWorkers();
-    const std::size_t ideal_workers = util::ceilDiv(to_sizet(nrtiles), to_sizet(2));
-    return std::clamp(ideal_workers, min_workers, available_workers);
-  }();
-
+  const auto hp_scheduler = di::getBackendScheduler<Backend::MC>(thread_priority::high);
   ex::start_detached(
-      ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads)),
-                   std::forward<CommSender>(row_comm), std::forward<CommSender>(col_comm),
+      ex::when_all(std::forward<CommSender>(row_comm), std::forward<CommSender>(col_comm),
                    std::forward<KSender>(k), std::forward<KLcSender>(k_lc), std::forward<RhoSender>(rho),
                    ex::when_all_vector(tc.read(d)), ex::when_all_vector(tc.readwrite(z)),
                    ex::when_all_vector(tc.readwrite(evals)), ex::when_all_vector(tc.read(i4)),
@@ -1276,317 +1268,375 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S
                    // additional workspaces
                    ex::just(std::vector<memory::MemoryView<T, Device::CPU>>()),
                    ex::just(memory::MemoryView<T, Device::CPU>())) |
-      ex::transfer(di::getBackendScheduler<Backend::MC>(thread_priority::high)) |
-      ex::bulk(nthreads, [nthreads, n, dist_sub, bcast_evals, all_reduce_in_place](
-                             const std::size_t thread_idx, auto& barrier_ptr, auto& row_comm_wrapper,
-                             auto& col_comm_wrapper, const SizeType k, const SizeType k_lc,
-                             const auto& rho, const auto& d_tiles, auto& z_tiles, const auto& eval_tiles,
-                             const auto& i4_tiles_arr, const auto& i6_tiles_arr,
-                             const auto& i2_tiles_arr, const auto& evec_tiles, auto& ws_cols,
-                             auto& ws_row) {
-        using dlaf::comm::internal::transformMPI;
-
-        common::Pipeline<comm::Communicator> row_comm_chain(row_comm_wrapper.get());
-        const dlaf::comm::Communicator& col_comm = col_comm_wrapper.get();
-
-        const SizeType m_el_lc = dist_sub.local_size().rows();
-        const SizeType n_el_lc = dist_sub.local_size().cols();
-
-        const auto barrier_busy_wait = getTridiagRank1BarrierBusyWait();
-
-        const SizeType* i4 = i4_tiles_arr[0].get().ptr();
-        const SizeType* i2 = i2_tiles_arr[0].get().ptr();
-        const SizeType* i6 = i6_tiles_arr[0].get().ptr();
-
-        // STEP 0a: Fill ones for deflated Eigenvectors and copy related Eigenvalues (single-thread)
-        // Note: this step is completely independent from the rest, but it is small and it is going
-        // to be dropped soon.
-        // Note: use last threads that in principle should have less work to do
-        if (k < n && thread_idx == nthreads - 1) {
-          const T* eval_initial_ptr = d_tiles[0].get().ptr();
-          T* eval_ptr = eval_tiles[0].ptr();
-
-          for (SizeType j_el_lc = k_lc; j_el_lc < n_el_lc; ++j_el_lc) {
-            const SizeType j_el = dist_sub.global_element_from_local_element<Coord::Col>(j_el_lc);
-            const SizeType i_el = j_el;
-
-            if (dist_sub.rank_index().row() == dist_sub.rank_global_element<Coord::Row>(i_el)) {
-              const SizeType i_el_lc = dist_sub.local_element_from_global_element<Coord::Row>(i_el);
-              const LocalTileIndex i_lc{dist_sub.local_tile_from_local_element<Coord::Row>(i_el_lc),
-                                        dist_sub.local_tile_from_local_element<Coord::Col>(j_el_lc)};
-              const SizeType linear_lc = dist_extra::local_tile_linear_index(dist_sub, i_lc);
-              const TileElementIndex
-                  ij_el_tl{dist_sub.tile_element_from_local_element<Coord::Row>(i_el_lc),
-                           dist_sub.tile_element_from_local_element<Coord::Col>(j_el_lc)};
-
-              evec_tiles[to_sizet(linear_lc)](ij_el_tl) = T{1};
-            }
-
-            eval_ptr[j_el] = eval_initial_ptr[i6[j_el]];
-          }
-        }
-
-        // STEP 0b: Initialize workspaces (single-thread)
-        if (thread_idx == 0) {
-          // Note:
-          // - nthreads are used for both LAED4 and weight calculation (one per worker thread)
-          // - last one is used for reducing weights from all workers
-          ws_cols.reserve(nthreads + 1);
-
-          // Note:
-          // Considering that
-          // - LAED4 requires working on k elements
-          // - Weight computation requires working on m_el_lc
-          //
-          // and they are needed at two steps that cannot happen in parallel, we opted for allocating
-          // the workspace with the highest requirement of memory, and reuse them for both steps.
-          const SizeType max_size = std::max(k, m_el_lc);
-          for (std::size_t i = 0; i < nthreads; ++i)
-            ws_cols.emplace_back(max_size);
-          ws_cols.emplace_back(m_el_lc);
-
-          ws_row = memory::MemoryView<T, Device::CPU>(n_el_lc);
-          std::fill_n(ws_row(), n_el_lc, 0);
-        }
-
-        // Note: we have to wait that LAED4 workspaces are ready to be used
-        barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-        const T* d_ptr = d_tiles[0].get().ptr();
-        const T* z_ptr = z_tiles[0].ptr();
-
-        // STEP 1: LAED4 (multi-thread)
-        if (thread_idx == 0) {  // TODO make it multi-threaded over multiple workers
-          common::internal::SingleThreadedBlasScope single;
-
-          T* eval_ptr = eval_tiles[0].ptr();
-          T* delta_ptr = ws_cols[thread_idx]();
-
-          for (SizeType j_el_lc = 0; j_el_lc < k_lc; ++j_el_lc) {
-            const SizeType j_el = dist_sub.global_element_from_local_element<Coord::Col>(j_el_lc);
-            const SizeType j_lc = dist_sub.local_tile_from_local_element<Coord::Col>(j_el_lc);
-
-            // Solve the deflated rank-1 problem
-            // Note:
-            // Input eigenvalues are stored "deflated" with i3, but laed4 is going to store them
-            // "locally" deflated, i.e. locally it is valid sort(non-deflated)|sort(deflated)
-            const SizeType js_el = i6[j_el];
-            T& eigenval = eval_ptr[to_sizet(j_el)];
-            lapack::laed4(to_signed<int64_t>(k), to_signed<int64_t>(js_el), d_ptr, z_ptr, delta_ptr, rho,
-                          &eigenval);
-
-            // Now laed4 result has to be copied in the right spot
-            const SizeType j_el_tl = dist_sub.tile_element_from_global_element<Coord::Col>(j_el);
-
-            for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) {
-              const SizeType i_el = dist_sub.global_element_from_local_element<Coord::Row>(i_el_lc);
-              const SizeType is_el = i4[i_el];
-
-              // just non-deflated, because deflated have been already set to 0
-              if (is_el < k) {
-                const SizeType i_lc = dist_sub.local_tile_from_local_element<Coord::Row>(i_el_lc);
-                const SizeType linear_lc = dist_extra::local_tile_linear_index(dist_sub, {i_lc, j_lc});
-                const auto& evec = evec_tiles[to_sizet(linear_lc)];
-                const SizeType i_el_tl = dist_sub.tile_element_from_local_element<Coord::Row>(i_el_lc);
-                evec({i_el_tl, j_el_tl}) = delta_ptr[is_el];
-              }
-            }
-          }
-        }
-        // Note: This barrier ensures that LAED4 finished, so from now on values are available
-        barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-        // STEP 2: Broadcast evals
-
-        // Note: this ensures that evals broadcasting finishes before bulk releases resources
-        ScopedSenderWait bcast_barrier;
-        if (thread_idx == 0)
-          bcast_barrier.sender_ = bcast_evals(row_comm_chain, eval_tiles);
-
-        // Note: laed4 handles k <= 2 cases differently
-        if (k <= 2)
-          return;
-
-        // STEP 2 Compute weights (multi-thread)
-        auto& q = evec_tiles;
-        T* w = ws_cols[thread_idx]();
-
-        // STEP 2a: copy diagonal from q -> w (or just initialize with 1)
-        if (thread_idx == 0) {
-          for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) {
-            const SizeType i_el = dist_sub.global_element_from_local_element<Coord::Row>(i_el_lc);
-            const SizeType is_el = i4[i_el];
-
-            if (is_el >= k) {
-              w[i_el_lc] = T{0};
-              continue;
-            }
-
-            const SizeType js_el = is_el;
-            const SizeType j_el = i2[js_el];
-
-            const GlobalElementIndex ij_subm_el(i_el, j_el);
-
-            if (dist_sub.rank_index().col() == dist_sub.rank_global_element<Coord::Col>(j_el)) {
-              const SizeType linear_subm_lc = dist_extra::local_tile_linear_index(
-                  dist_sub, {dist_sub.local_tile_from_local_element<Coord::Row>(i_el_lc),
-                             dist_sub.local_tile_from_global_element<Coord::Col>(j_el)});
-              const TileElementIndex ij_tl = dist_sub.tile_element_index(ij_subm_el);
-              w[i_el_lc] = q[to_sizet(linear_subm_lc)](ij_tl);
-            }
-            else {
-              w[i_el_lc] = T{1};
-            }
-          }
-        }
-        else {  // other workers
-          std::fill_n(w, m_el_lc, T(1));
-        }
-
-        barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-        // STEP 2b: compute weights
-        if (thread_idx == 0) {  // TODO make it multithreaded again
-          for (SizeType j_el_lc = 0; j_el_lc < k_lc; ++j_el_lc) {
-            const SizeType j_el = dist_sub.global_element_from_local_element<Coord::Col>(j_el_lc);
-            const SizeType js_el = i6[j_el];
-
-            for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) {
-              const SizeType i_el = dist_sub.global_element_from_local_element<Coord::Row>(i_el_lc);
-              const SizeType is_el = i4[i_el];
-
-              // skip if deflated
-              if (is_el >= k)
-                continue;
-
-              // skip if originally it was on the diagonal
-              if (is_el == js_el)
-                continue;
-
-              const SizeType linear_lc = dist_extra::local_tile_linear_index(
-                  dist_sub, {dist_sub.local_tile_from_local_element<Coord::Row>(i_el_lc),
-                             dist_sub.local_tile_from_local_element<Coord::Col>(j_el_lc)});
-              const TileElementIndex ij_tl = dist_sub.tile_element_index({i_el, j_el});
-
-              w[i_el_lc] *=
-                  q[to_sizet(linear_lc)](ij_tl) / (d_ptr[to_sizet(is_el)] - d_ptr[to_sizet(js_el)]);
-            }
-          }
-        }
-
-        barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-        // STEP 2c: reduce, then finalize computation with sign and square root (single-thread)
-        if (thread_idx == 0) {
-          // local reduction from all bulk workers
-          for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) {
-            for (std::size_t tidx = 1; tidx < nthreads; ++tidx) {
-              const T* w_partial = ws_cols[tidx]();
-              w[i_el_lc] *= w_partial[i_el_lc];
-            }
-          }
-
-          tt::sync_wait(ex::when_all(row_comm_chain(),
-                                     ex::just(MPI_PROD, common::make_data(w, m_el_lc))) |
-                        transformMPI(all_reduce_in_place));
+      ex::transfer(hp_scheduler) |
+      ex::let_value([n, dist_sub, bcast_evals, all_reduce_in_place, hp_scheduler](
+                        auto& row_comm_wrapper, auto& col_comm_wrapper, const SizeType k,
+                        const SizeType k_lc, const auto& rho, const auto& d_tiles, auto& z_tiles,
+                        const auto& eval_tiles, const auto& i4_tiles_arr, const auto& i6_tiles_arr,
+                        const auto& i2_tiles_arr, const auto& evec_tiles, auto& ws_cols, auto& ws_row) {
+        using pika::execution::thread_priority;
+
+        const std::size_t nthreads = [dist_sub, k_lc] {
+          const std::size_t workload = to_sizet(dist_sub.localSize().rows() * k_lc);
+          const std::size_t workload_unit = 2 * to_sizet(dist_sub.tile_size().linear_size());
+
+          const std::size_t min_workers = 1;
+          const std::size_t available_workers = getTridiagRank1NWorkers();
+
+          const std::size_t ideal_workers = util::ceilDiv(to_sizet(workload), workload_unit);
+          return std::clamp(ideal_workers, min_workers, available_workers);
+        }();
+
+        return ex::just(std::make_unique<pika::barrier<>>(nthreads)) | ex::transfer(hp_scheduler) |
+               ex::bulk(nthreads, [&row_comm_wrapper, &col_comm_wrapper, k, k_lc, &rho, &d_tiles,
+                                   &z_tiles, &eval_tiles, &i4_tiles_arr, &i6_tiles_arr, &i2_tiles_arr,
+                                   &evec_tiles, &ws_cols, &ws_row, nthreads, n, dist_sub, bcast_evals,
+                                   all_reduce_in_place](const std::size_t thread_idx,
+                                                        auto& barrier_ptr) {
+                 using dlaf::comm::internal::transformMPI;
+
+                 common::Pipeline<comm::Communicator> row_comm_chain(row_comm_wrapper.get());
+                 const dlaf::comm::Communicator& col_comm = col_comm_wrapper.get();
+
+                 const SizeType m_lc = dist_sub.local_nr_tiles().rows();
+                 const SizeType m_el_lc = dist_sub.local_size().rows();
+                 const SizeType n_el_lc = dist_sub.local_size().cols();
+
+                 const auto barrier_busy_wait = getTridiagRank1BarrierBusyWait();
+
+                 const SizeType* i4 = i4_tiles_arr[0].get().ptr();
+                 const SizeType* i2 = i2_tiles_arr[0].get().ptr();
+                 const SizeType* i6 = i6_tiles_arr[0].get().ptr();
+
+                 // STEP 0a: Fill ones for deflated Eigenvectors and copy related Eigenvalues (single-thread)
+                 // Note: this step is completely independent from the rest, but it is small and it is going
+                 // to be dropped soon.
+                 // Note: use last threads that in principle should have less work to do
+                 if (k < n && thread_idx == nthreads - 1) {
+                   const T* eval_initial_ptr = d_tiles[0].get().ptr();
+                   T* eval_ptr = eval_tiles[0].ptr();
+
+                   for (SizeType j_el_lc = k_lc; j_el_lc < n_el_lc; ++j_el_lc) {
+                     const SizeType j_el =
+                         dist_sub.global_element_from_local_element<Coord::Col>(j_el_lc);
+                     const SizeType i_el = j_el;
+
+                     if (dist_sub.rank_index().row() == dist_sub.rank_global_element<Coord::Row>(i_el)) {
+                       const SizeType i_el_lc =
+                           dist_sub.local_element_from_global_element<Coord::Row>(i_el);
+                       const LocalTileIndex
+                           i_lc{dist_sub.local_tile_from_local_element<Coord::Row>(i_el_lc),
+                                dist_sub.local_tile_from_local_element<Coord::Col>(j_el_lc)};
+                       const SizeType linear_lc = dist_extra::local_tile_linear_index(dist_sub, i_lc);
+                       const TileElementIndex
+                           ij_el_tl{dist_sub.tile_element_from_local_element<Coord::Row>(i_el_lc),
+                                    dist_sub.tile_element_from_local_element<Coord::Col>(j_el_lc)};
+
+                       evec_tiles[to_sizet(linear_lc)](ij_el_tl) = T{1};
+                     }
+
+                     eval_ptr[j_el] = eval_initial_ptr[i6[j_el]];
+                   }
+                 }
+
+                 const std::size_t batch_size = util::ceilDiv(to_sizet(k_lc), nthreads);
+                 const SizeType begin = to_SizeType(thread_idx * batch_size);
+                 const SizeType end = std::min(to_SizeType(thread_idx * batch_size + batch_size), k_lc);
+
+                 // STEP 0b: Initialize workspaces (single-thread)
+                 if (thread_idx == 0) {
+                   // Note:
+                   // - nthreads are used for both LAED4 and weight calculation (one per worker thread)
+                   // - last one is used for reducing weights from all workers
+                   ws_cols.reserve(nthreads + 1);
+
+                   // Note:
+                   // Considering that
+                   // - LAED4 requires working on k elements
+                   // - Weight computation requires working on m_el_lc
+                   //
+                   // and they are needed at two steps that cannot happen in parallel, we opted for allocating
+                   // the workspace with the highest requirement of memory, and reuse them for both steps.
+                   const SizeType max_size = std::max(k, m_el_lc);
+                   for (std::size_t i = 0; i < nthreads; ++i)
+                     ws_cols.emplace_back(max_size);
+                   ws_cols.emplace_back(m_el_lc);
+
+                   ws_row = memory::MemoryView<T, Device::CPU>(n_el_lc);
+                   std::fill_n(ws_row(), n_el_lc, 0);
+                 }
+
+                 // Note: we have to wait that LAED4 workspaces are ready to be used
+                 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+                 const T* d_ptr = d_tiles[0].get().ptr();
+                 const T* z_ptr = z_tiles[0].ptr();
+
+                 // STEP 1: LAED4 (multi-thread)
+                 {
+                   common::internal::SingleThreadedBlasScope single;
+
+                   T* eval_ptr = eval_tiles[0].ptr();
+                   T* delta_ptr = ws_cols[thread_idx]();
+
+                   for (SizeType j_el_lc = begin; j_el_lc < end; ++j_el_lc) {
+                     const SizeType j_el =
+                         dist_sub.global_element_from_local_element<Coord::Col>(j_el_lc);
+                     const SizeType j_lc = dist_sub.local_tile_from_local_element<Coord::Col>(j_el_lc);
+
+                     // Solve the deflated rank-1 problem
+                     // Note:
+                     // Input eigenvalues are stored "deflated" with i3, but laed4 is going to store them
+                     // "locally" deflated, i.e. locally it is valid sort(non-deflated)|sort(deflated)
+                     const SizeType js_el = i6[j_el];
+                     T& eigenval = eval_ptr[to_sizet(j_el)];
+                     lapack::laed4(to_signed<int64_t>(k), to_signed<int64_t>(js_el), d_ptr, z_ptr,
+                                   delta_ptr, rho, &eigenval);
+
+                     // Now laed4 result has to be copied in the right spot
+                     const SizeType j_el_tl =
+                         dist_sub.tile_element_from_global_element<Coord::Col>(j_el);
+
+                     for (SizeType i_lc = 0; i_lc < m_lc; ++i_lc) {
+                       const SizeType i = dist_sub.global_tile_from_local_tile<Coord::Row>(i_lc);
+                       const SizeType m_el_tl = dist_sub.tile_size_of<Coord::Row>(i);
+                       const SizeType linear_lc =
+                           dist_extra::local_tile_linear_index(dist_sub, {i_lc, j_lc});
+                       const auto& evec = evec_tiles[to_sizet(linear_lc)];
+                       for (SizeType i_el_tl = 0; i_el_tl < m_el_tl; ++i_el_tl) {
+                         const SizeType i_el =
+                             dist_sub.global_element_from_local_tile_and_tile_element<Coord::Row>(
+                                 i_lc, i_el_tl);
+                         DLAF_ASSERT_HEAVY(i_el < n, i_el, n);
+                         const SizeType is_el = i4[i_el];
+
+                         // just non-deflated, because deflated have been already set to 0
+                         if (is_el < k)
+                           evec({i_el_tl, j_el_tl}) = delta_ptr[is_el];
+                       }
+                     }
+                   }
+                 }
+                 // Note: This barrier ensures that LAED4 finished, so from now on values are available
+                 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+                 // STEP 2: Broadcast evals
+
+                 // Note: this ensures that evals broadcasting finishes before bulk releases resources
+                 ScopedSenderWait bcast_barrier;
+                 if (thread_idx == 0)
+                   bcast_barrier.sender_ = bcast_evals(row_comm_chain, eval_tiles);
+
+                 // Note: laed4 handles k <= 2 cases differently
+                 if (k <= 2)
+                   return;
+
+                 // STEP 2 Compute weights (multi-thread)
+                 auto& q = evec_tiles;
+                 T* w = ws_cols[thread_idx]();
+
+                 // STEP 2a: copy diagonal from q -> w (or just initialize with 1)
+                 if (thread_idx == 0) {
+                   for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) {
+                     const SizeType i_el =
+                         dist_sub.global_element_from_local_element<Coord::Row>(i_el_lc);
+                     const SizeType is_el = i4[i_el];
+
+                     if (is_el >= k) {
+                       w[i_el_lc] = T{0};
+                       continue;
+                     }
+
+                     const SizeType js_el = is_el;
+                     const SizeType j_el = i2[js_el];
+
+                     const GlobalElementIndex ij_subm_el(i_el, j_el);
+
+                     if (dist_sub.rank_index().col() == dist_sub.rank_global_element<Coord::Col>(j_el)) {
+                       const SizeType linear_subm_lc = dist_extra::local_tile_linear_index(
+                           dist_sub, {dist_sub.local_tile_from_local_element<Coord::Row>(i_el_lc),
+                                      dist_sub.local_tile_from_global_element<Coord::Col>(j_el)});
+                       const TileElementIndex ij_tl = dist_sub.tile_element_index(ij_subm_el);
+                       w[i_el_lc] = q[to_sizet(linear_subm_lc)](ij_tl);
+                     }
+                     else {
+                       w[i_el_lc] = T{1};
+                     }
+                   }
+                 }
+                 else {  // other workers
+                   std::fill_n(w, m_el_lc, T(1));
+                 }
+
+                 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+                 // STEP 2b: compute weights
+                 {
+                   for (SizeType j_el_lc = begin; j_el_lc < end; ++j_el_lc) {
+                     const SizeType j_el =
+                         dist_sub.global_element_from_local_element<Coord::Col>(j_el_lc);
+                     const SizeType j_lc = dist_sub.local_tile_from_global_element<Coord::Col>(j_el);
+                     const SizeType js_el = i6[j_el];
+                     const T delta_j = d_ptr[to_sizet(js_el)];
+
+                     const SizeType j_el_tl =
+                         dist_sub.tile_element_from_local_element<Coord::Col>(j_el_lc);
+
+                     for (SizeType i_lc = 0; i_lc < m_lc; ++i_lc) {
+                       const SizeType i = dist_sub.global_tile_from_local_tile<Coord::Row>(i_lc);
+                       const SizeType m_el_tl = dist_sub.tile_size_of<Coord::Row>(i);
+                       const SizeType linear_lc =
+                           dist_extra::local_tile_linear_index(dist_sub, {i_lc, j_lc});
+                       const auto& q_tile = q[to_sizet(linear_lc)];
+
+                       for (SizeType i_el_tl = 0; i_el_tl < m_el_tl; ++i_el_tl) {
+                         const SizeType i_el =
+                             dist_sub.global_element_from_global_tile_and_tile_element<Coord::Row>(
+                                 i, i_el_tl);
+                         DLAF_ASSERT_HEAVY(i_el < n, i_el, n);
+                         const SizeType is_el = i4[i_el];
+
+                         // skip if deflated
+                         if (is_el >= k)
+                           continue;
+
+                         // skip if originally it was on the diagonal
+                         if (is_el == js_el)
+                           continue;
+
+                         const SizeType i_el_lc =
+                             dist_sub.local_element_from_local_tile_and_tile_element<Coord::Row>(
+                                 i_lc, i_el_tl);
+                         const TileElementIndex ij_tl(i_el_tl, j_el_tl);
+
+                         w[i_el_lc] *= q_tile(ij_tl) / (d_ptr[to_sizet(is_el)] - delta_j);
+                       }
+                     }
+                   }
+                 }
+
+                 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+                 // STEP 2c: reduce, then finalize computation with sign and square root (single-thread)
+                 if (thread_idx == 0) {
+                   // local reduction from all bulk workers
+                   for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) {
+                     for (std::size_t tidx = 1; tidx < nthreads; ++tidx) {
+                       const T* w_partial = ws_cols[tidx]();
+                       w[i_el_lc] *= w_partial[i_el_lc];
+                     }
+                   }
+
+                   tt::sync_wait(ex::when_all(row_comm_chain(),
+                                              ex::just(MPI_PROD, common::make_data(w, m_el_lc))) |
+                                 transformMPI(all_reduce_in_place));
 
 #ifdef DLAF_ASSERT_HEAVY_ENABLE
-          // Note: all input for weights computation of non-deflated rows should be strictly less than 0
-          for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) {
-            const SizeType i_el = dist_sub.global_element_from_local_element<Coord::Row>(i_el_lc);
-            const SizeType is = i4[i_el];
-            if (is < k)
-              DLAF_ASSERT_HEAVY(w[i_el_lc] < 0, w[i_el_lc]);
-          }
+                   // Note: all input for weights computation of non-deflated rows should be strictly less than 0
+                   for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) {
+                     const SizeType i_el =
+                         dist_sub.global_element_from_local_element<Coord::Row>(i_el_lc);
+                     const SizeType is = i4[i_el];
+                     if (is < k)
+                       DLAF_ASSERT_HEAVY(w[i_el_lc] < 0, w[i_el_lc]);
+                   }
 #endif
 
-          T* weights = ws_cols[nthreads]();
-          for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) {
-            const SizeType i_el = dist_sub.global_element_from_local_element<Coord::Row>(i_el_lc);
-            const SizeType is_el = i4[i_el];
-            weights[to_sizet(i_el_lc)] = std::copysign(std::sqrt(-w[i_el_lc]), z_ptr[to_sizet(is_el)]);
-          }
-        }
-
-        barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-        // STEP 3: Compute eigenvectors of the modified rank-1 modification (normalize) (multi-thread)
-
-        // STEP 3a: Form evecs using weights vector and compute (local) sum of squares
-        if (thread_idx == 0) {  // TODO make it multithreaded again
-          common::internal::SingleThreadedBlasScope single;
-
-          const T* w = ws_cols[nthreads]();
-          T* sum_squares = ws_row();
-
-          for (SizeType j_el_lc = 0; j_el_lc < k_lc; ++j_el_lc) {
-            const SizeType j_lc = dist_sub.local_tile_from_local_element<Coord::Col>(j_el_lc);
-            const SizeType j_el_tl = dist_sub.tile_element_from_local_element<Coord::Col>(j_el_lc);
-
-            for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) {
-              const SizeType i_el = dist_sub.global_element_from_local_element<Coord::Row>(i_el_lc);
-              const SizeType is_el = i4[i_el];
-
-              // it is a deflated row, skip it (it should be already 0)
-              if (is_el >= k)
-                continue;
-
-              const LocalTileIndex ij_lc(dist_sub.local_tile_from_local_element<Coord::Row>(i_el_lc),
-                                         j_lc);
-              const SizeType ij_linear = dist_extra::local_tile_linear_index(dist_sub, ij_lc);
-              const TileElementIndex ij_el_tl(
-                  dist_sub.tile_element_from_local_element<Coord::Row>(i_el_lc), j_el_tl);
-
-              const auto& q_tile = q[to_sizet(ij_linear)];
-
-              q_tile(ij_el_tl) = w[i_el_lc] / q_tile(ij_el_tl);
-            }
-
-            // column-major once the full column has been updated, compute the sum of squares (for norm)
-            for (SizeType i_lc = 0; i_lc < dist_sub.local_nr_tiles().rows(); ++i_lc) {
-              const LocalTileIndex ij_lc(i_lc, j_lc);
-              const SizeType ij_linear = dist_extra::local_tile_linear_index(dist_sub, ij_lc);
-              const T* partial_evec = q[to_sizet(ij_linear)].ptr({0, j_el_tl});
-              const SizeType i = dist_sub.global_tile_from_local_tile<Coord::Row>(i_lc);
-              const SizeType m_el_tl = dist_sub.tile_size_of<Coord::Row>(i);
-              sum_squares[j_el_lc] += blas::dot(m_el_tl, partial_evec, 1, partial_evec, 1);
-            }
-          }
-        }
-
-        barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-        // STEP 3b: Reduce to get the sum of all squares on all ranks
-        if (thread_idx == 0)
-          tt::sync_wait(ex::just(std::cref(col_comm), MPI_SUM, common::make_data(ws_row(), k_lc)) |
-                        transformMPI(all_reduce_in_place));
-
-        barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-        // STEP 3c: Normalize (compute norm of each column and scale column vector)
-        if (thread_idx == 0) {  // TODO make it multithreaded again
-          common::internal::SingleThreadedBlasScope single;
-
-          const T* sum_squares = ws_row();
-
-          for (SizeType j_el_lc = 0; j_el_lc < k_lc; ++j_el_lc) {
-            const SizeType j_lc = dist_sub.local_tile_from_local_element<Coord::Col>(j_el_lc);
-            const SizeType j_el_tl = dist_sub.tile_element_from_local_element<Coord::Col>(j_el_lc);
-
-            const T vec_norm = std::sqrt(sum_squares[j_el_lc]);
-
-            for (SizeType i_lc = 0; i_lc < dist_sub.local_nr_tiles().rows(); ++i_lc) {
-              const LocalTileIndex ij_lc(i_lc, j_lc);
-              const SizeType ij_linear = dist_extra::local_tile_linear_index(dist_sub, ij_lc);
-
-              T* partial_evec = q[to_sizet(ij_linear)].ptr({0, j_el_tl});
-
-              const SizeType i = dist_sub.global_tile_from_local_tile<Coord::Row>(i_lc);
-              const SizeType m_el_tl = dist_sub.tile_size_of<Coord::Row>(i);
-              blas::scal(m_el_tl, 1 / vec_norm, partial_evec, 1);
-            }
-          }
-        }
+                   T* weights = ws_cols[nthreads]();
+                   for (SizeType i_el_lc = 0; i_el_lc < m_el_lc; ++i_el_lc) {
+                     const SizeType i_el =
+                         dist_sub.global_element_from_local_element<Coord::Row>(i_el_lc);
+                     const SizeType is_el = i4[i_el];
+                     weights[to_sizet(i_el_lc)] =
+                         std::copysign(std::sqrt(-w[i_el_lc]), z_ptr[to_sizet(is_el)]);
+                   }
+                 }
+
+                 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+                 // STEP 3: Compute eigenvectors of the modified rank-1 modification (normalize) (multi-thread)
+
+                 // STEP 3a: Form evecs using weights vector and compute (local) sum of squares
+                 {
+                   common::internal::SingleThreadedBlasScope single;
+
+                   const T* w = ws_cols[nthreads]();
+                   T* sum_squares = ws_row();
+
+                   for (SizeType j_el_lc = begin; j_el_lc < end; ++j_el_lc) {
+                     const SizeType j_lc = dist_sub.local_tile_from_local_element<Coord::Col>(j_el_lc);
+                     const SizeType j_el_tl =
+                         dist_sub.tile_element_from_local_element<Coord::Col>(j_el_lc);
+
+                     for (SizeType i_lc = 0; i_lc < m_lc; ++i_lc) {
+                       const SizeType i = dist_sub.global_tile_from_local_tile<Coord::Row>(i_lc);
+                       const SizeType m_el_tl = dist_sub.tile_size_of<Coord::Row>(i);
+                       const SizeType linear_lc =
+                           dist_extra::local_tile_linear_index(dist_sub, {i_lc, j_lc});
+                       const auto& q_tile = q[to_sizet(linear_lc)];
+
+                       for (SizeType i_el_tl = 0; i_el_tl < m_el_tl; ++i_el_tl) {
+                         const SizeType i_el =
+                             dist_sub.global_element_from_global_tile_and_tile_element<Coord::Row>(
+                                 i, i_el_tl);
+
+                         DLAF_ASSERT_HEAVY(i_el < n, i_el, n);
+                         const SizeType is_el = i4[i_el];
+
+                         // it is a deflated row, skip it (it should be already 0)
+                         if (is_el >= k)
+                           continue;
+
+                         const SizeType i_el_lc =
+                             dist_sub.local_element_from_local_tile_and_tile_element<Coord::Row>(
+                                 i_lc, i_el_tl);
+                         const TileElementIndex ij_el_tl(i_el_tl, j_el_tl);
+
+                         q_tile(ij_el_tl) = w[i_el_lc] / q_tile(ij_el_tl);
+                       }
+
+                       const T* partial_evec = q_tile.ptr({0, j_el_tl});
+                       sum_squares[j_el_lc] += blas::dot(m_el_tl, partial_evec, 1, partial_evec, 1);
+                     }
+                   }
+                 }
+
+                 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+                 // STEP 3b: Reduce to get the sum of all squares on all ranks
+                 if (thread_idx == 0)
+                   tt::sync_wait(ex::just(std::cref(col_comm), MPI_SUM,
+                                          common::make_data(ws_row(), k_lc)) |
+                                 transformMPI(all_reduce_in_place));
+
+                 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+                 // STEP 3c: Normalize (compute norm of each column and scale column vector)
+                 {
+                   common::internal::SingleThreadedBlasScope single;
+
+                   const T* sum_squares = ws_row();
+
+                   for (SizeType j_el_lc = begin; j_el_lc < end; ++j_el_lc) {
+                     const SizeType j_lc = dist_sub.local_tile_from_local_element<Coord::Col>(j_el_lc);
+                     const SizeType j_el_tl =
+                         dist_sub.tile_element_from_local_element<Coord::Col>(j_el_lc);
+
+                     const T vec_norm = std::sqrt(sum_squares[j_el_lc]);
+
+                     for (SizeType i_lc = 0; i_lc < m_lc; ++i_lc) {
+                       const LocalTileIndex ij_lc(i_lc, j_lc);
+                       const SizeType ij_linear = dist_extra::local_tile_linear_index(dist_sub, ij_lc);
+
+                       T* partial_evec = q[to_sizet(ij_linear)].ptr({0, j_el_tl});
+
+                       const SizeType i = dist_sub.global_tile_from_local_tile<Coord::Row>(i_lc);
+                       const SizeType m_el_tl = dist_sub.tile_size_of<Coord::Row>(i);
+                       blas::scal(m_el_tl, 1 / vec_norm, partial_evec, 1);
+                     }
+                   }
+                 }
+               });
       }));
 }
 

From 72b5210b21447bd4b61b19a54a4b3f6bf0f0ecef Mon Sep 17 00:00:00 2001
From: guglielmogagliardi <94535690+gulivarese@users.noreply.github.com>
Date: Tue, 12 Dec 2023 13:45:52 +0100
Subject: [PATCH 2/7] add miniapp triangular multiplication (#1029)

---
 miniapp/CMakeLists.txt                        |   2 +
 miniapp/miniapp_triangular_multiplication.cpp | 288 ++++++++++++++++++
 miniapp/miniapp_triangular_solver.cpp         |   1 +
 scripts/miniapps.py                           |  35 +++
 scripts/plot_trmm_strong.py                   |  20 ++
 scripts/plot_trmm_weak.py                     |  20 ++
 scripts/postprocess.py                        |  78 ++++-
 7 files changed, 441 insertions(+), 3 deletions(-)
 create mode 100644 miniapp/miniapp_triangular_multiplication.cpp
 create mode 100755 scripts/plot_trmm_strong.py
 create mode 100755 scripts/plot_trmm_weak.py

diff --git a/miniapp/CMakeLists.txt b/miniapp/CMakeLists.txt
index 9d5b054928..8f3c145f15 100644
--- a/miniapp/CMakeLists.txt
+++ b/miniapp/CMakeLists.txt
@@ -39,6 +39,8 @@ DLAF_addMiniapp(miniapp_bt_reduction_to_band SOURCES miniapp_bt_reduction_to_ban
 
 DLAF_addMiniapp(miniapp_triangular_solver SOURCES miniapp_triangular_solver.cpp)
 
+DLAF_addMiniapp(miniapp_triangular_multiplication SOURCES miniapp_triangular_multiplication.cpp)
+
 DLAF_addMiniapp(miniapp_eigensolver SOURCES miniapp_eigensolver.cpp)
 
 DLAF_addMiniapp(miniapp_gen_eigensolver SOURCES miniapp_gen_eigensolver.cpp)
diff --git a/miniapp/miniapp_triangular_multiplication.cpp b/miniapp/miniapp_triangular_multiplication.cpp
new file mode 100644
index 0000000000..4cd0e52692
--- /dev/null
+++ b/miniapp/miniapp_triangular_multiplication.cpp
@@ -0,0 +1,288 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2018-2023, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <type_traits>
+
+#include <blas/util.hh>
+#include <mpi.h>
+
+#include <pika/init.hpp>
+#include <pika/program_options.hpp>
+#include <pika/runtime.hpp>
+
+#include <dlaf/common/format_short.h>
+#include <dlaf/common/index2d.h>
+#include <dlaf/common/range2d.h>
+#include <dlaf/common/timer.h>
+#include <dlaf/communication/communicator_grid.h>
+#include <dlaf/communication/init.h>
+#include <dlaf/init.h>
+#include <dlaf/matrix/copy.h>
+#include <dlaf/matrix/index.h>
+#include <dlaf/matrix/matrix_mirror.h>
+#include <dlaf/miniapp/dispatch.h>
+#include <dlaf/miniapp/options.h>
+#include <dlaf/multiplication/triangular.h>
+#include <dlaf/types.h>
+#include <dlaf/util_matrix.h>
+
+namespace {
+
+using dlaf::Backend;
+using dlaf::DefaultDevice_v;
+using dlaf::Device;
+using dlaf::GlobalElementIndex;
+using dlaf::GlobalElementSize;
+using dlaf::SizeType;
+using dlaf::TileElementSize;
+using dlaf::comm::Communicator;
+using dlaf::comm::CommunicatorGrid;
+using dlaf::common::Ordering;
+
+struct Options
+    : dlaf::miniapp::MiniappOptions<dlaf::miniapp::SupportReal::Yes, dlaf::miniapp::SupportComplex::No> {
+  SizeType m;
+  SizeType n;
+  SizeType mb;
+  SizeType nb;
+  blas::Side side;
+  blas::Uplo uplo;
+  blas::Op op;
+  blas::Diag diag;
+
+  Options(const pika::program_options::variables_map& vm)
+      : MiniappOptions(vm), m(vm["m"].as<SizeType>()), n(vm["n"].as<SizeType>()),
+        mb(vm["mb"].as<SizeType>()), nb(vm["nb"].as<SizeType>()),
+        side(dlaf::miniapp::parseSide(vm["side"].as<std::string>())),
+        uplo(dlaf::miniapp::parseUplo(vm["uplo"].as<std::string>())),
+        op(dlaf::miniapp::parseOp(vm["op"].as<std::string>())),
+        diag(dlaf::miniapp::parseDiag(vm["diag"].as<std::string>())) {
+    DLAF_ASSERT(m > 0 && n > 0, m, n);
+    DLAF_ASSERT(mb > 0 && nb > 0, mb, nb);
+
+    if (do_check != dlaf::miniapp::CheckIterFreq::None) {
+      std::cerr << "Warning! At the moment result checking it is not implemented." << std::endl;
+      do_check = dlaf::miniapp::CheckIterFreq::None;
+    }
+  }
+
+  Options(Options&&) = default;
+  Options(const Options&) = default;
+  Options& operator=(Options&&) = default;
+  Options& operator=(const Options&) = default;
+};
+
+template <typename T>
+using matrix_values_t = std::function<T(const GlobalElementIndex&)>;
+
+template <typename T>
+using linear_system_t =
+    std::tuple<matrix_values_t<T>, matrix_values_t<T>, matrix_values_t<T>>;  // A, B, X
+
+template <typename T>
+linear_system_t<T> sampleLeftTr(blas::Uplo uplo, blas::Op op, blas::Diag diag, T alpha, SizeType m);
+}
+
+struct triangularMultiplicationMiniapp {
+  template <dlaf::Backend backend, typename T>
+  static void run(const Options& opts) {
+    Communicator world(MPI_COMM_WORLD);
+    CommunicatorGrid comm_grid(world, opts.grid_rows, opts.grid_cols, Ordering::ColumnMajor);
+
+    // Allocate memory for the matrices
+    dlaf::matrix::Matrix<T, Device::CPU> ah(GlobalElementSize{opts.m, opts.m},
+                                            TileElementSize{opts.mb, opts.mb}, comm_grid);
+    dlaf::matrix::Matrix<T, Device::CPU> bh(GlobalElementSize{opts.m, opts.n},
+                                            TileElementSize{opts.mb, opts.nb}, comm_grid);
+
+    dlaf::matrix::MatrixMirror<T, DefaultDevice_v<backend>, Device::CPU> a(ah);
+    dlaf::matrix::MatrixMirror<T, DefaultDevice_v<backend>, Device::CPU> b(bh);
+
+    auto sync_barrier = [&]() {
+      a.get().waitLocalTiles();
+      b.get().waitLocalTiles();
+      DLAF_MPI_CHECK_ERROR(MPI_Barrier(world));
+    };
+
+    const auto side = opts.side;
+    DLAF_ASSERT(side == blas::Side::Left, side);
+
+    const auto uplo = opts.uplo;
+    const auto op = opts.op;
+    const auto diag = opts.diag;
+    const T alpha = 2.0;
+
+    double m = ah.size().rows();
+    double n = bh.size().cols();
+    auto add_mul = n * m * m / 2;
+    const double total_ops = dlaf::total_ops<T>(add_mul, add_mul);
+
+    auto [in_op_a, out_b, in_b] = ::sampleLeftTr(uplo, op, diag, alpha, ah.size().rows());
+
+    for (int64_t run_index = -opts.nwarmups; run_index < opts.nruns; ++run_index) {
+      if (0 == world.rank() && run_index >= 0)
+        std::cout << "[" << run_index << "]" << std::endl;
+
+      // setup matrix A and b
+      using dlaf::matrix::util::set;
+      set(ah, in_op_a, op);
+      set(bh, in_b);
+      a.copySourceToTarget();
+      b.copySourceToTarget();
+
+      sync_barrier();
+
+      dlaf::common::Timer<> timeit;
+      if (opts.local)
+        dlaf::triangular_multiplication<backend, dlaf::DefaultDevice_v<backend>, T>(side, uplo, op, diag,
+                                                                                    alpha, a.get(),
+                                                                                    b.get());
+      else
+        dlaf::triangular_multiplication<backend, dlaf::DefaultDevice_v<backend>, T>(
+            comm_grid, side, uplo, op, diag, alpha, a.get(), b.get());
+
+      sync_barrier();
+
+      // benchmark results
+      if (0 == world.rank() && run_index >= 0) {
+        auto elapsed_time = timeit.elapsed();
+        double gigaflops = total_ops / elapsed_time / 1e9;
+
+        std::cout << "[" << run_index << "]"
+                  << " " << elapsed_time << "s"
+                  << " " << gigaflops << "GFlop/s"
+                  << " " << dlaf::internal::FormatShort{opts.type}
+                  << dlaf::internal::FormatShort{opts.side} << dlaf::internal::FormatShort{opts.uplo}
+                  << dlaf::internal::FormatShort{opts.op} << dlaf::internal::FormatShort{opts.diag}
+                  << " " << bh.size() << " " << bh.blockSize() << " " << comm_grid.size() << " "
+                  << pika::get_os_thread_count() << " " << backend << std::endl;
+      }
+
+      b.copyTargetToSource();
+
+      // (optional) run test
+      if ((opts.do_check == dlaf::miniapp::CheckIterFreq::Last && run_index == (opts.nruns - 1)) ||
+          opts.do_check == dlaf::miniapp::CheckIterFreq::All) {
+        DLAF_UNIMPLEMENTED("Check");
+      }
+    }
+  }
+};
+
+int pika_main(pika::program_options::variables_map& vm) {
+  pika::scoped_finalize pika_finalizer;
+  dlaf::ScopedInitializer init(vm);
+
+  const Options opts(vm);
+  dlaf::miniapp::dispatchMiniapp<triangularMultiplicationMiniapp>(opts);
+
+  return EXIT_SUCCESS;
+}
+
+int main(int argc, char** argv) {
+  dlaf::comm::mpi_init mpi_initter(argc, argv);
+
+  // options
+  using namespace pika::program_options;
+  options_description desc_commandline(
+      "Benchmark computation of A . B, "
+      "where A is a non-unit lower triangular matrix, and B is an m by n matrix\n\n"
+      "options\n"
+      "Usage: miniapp_triangular_multiplication [options]");
+  desc_commandline.add(dlaf::miniapp::getMiniappOptionsDescription());
+  desc_commandline.add(dlaf::getOptionsDescription());
+
+  // clang-format off
+  desc_commandline.add_options()
+    ("m",             value<SizeType>()     ->default_value(4096),       "Matrix b rows")
+    ("n",             value<SizeType>()     ->default_value(512),        "Matrix b columns")
+    ("mb",            value<SizeType>()     ->default_value(256),        "Matrix b block rows")
+    ("nb",            value<SizeType>()     ->default_value(512),        "Matrix b block columns")
+  ;
+  // clang-format on
+  dlaf::miniapp::addSideOption(desc_commandline);
+  dlaf::miniapp::addUploOption(desc_commandline);
+  dlaf::miniapp::addOpOption(desc_commandline);
+  dlaf::miniapp::addDiagOption(desc_commandline);
+
+  pika::init_params p;
+  p.desc_cmdline = desc_commandline;
+  p.rp_callback = dlaf::initResourcePartitionerHandler;
+  return pika::init(pika_main, argc, argv, p);
+}
+
+namespace {
+/// Returns a tuple of element generators of three matrices A(m x m), B (m x n), X (m x n), for which it
+/// holds op(A) X = alpha B (alpha can be any value).
+///
+/// The elements of op(A) (@p el_op_a) are chosen such that:
+///   op(A)_ik = (i+1) / (k+.5) * exp(I*(2*i-k)) for the referenced elements
+///   op(A)_ik = -9.9 otherwise,
+/// where I = 0 for real types or I is the complex unit for complex types.
+///
+/// The elements of X (@p el_x) are computed as
+///   X_kj = (k+.5) / (j+2) * exp(I*(k+j)).
+/// These data are typically used to check whether the result of the equation
+/// performed with any algorithm is consistent with the computed values.
+///
+/// Finally, the elements of B (@p el_b) should be:
+/// B_ij = (Sum_k op(A)_ik * X_kj) / alpha
+///      = (op(A)_ii * X_ij + (kk-1) * gamma) / alpha,
+/// where gamma = (i+1) / (j+2) * exp(I*(2*i+j)),
+///       kk = i+1 if op(a) is an lower triangular matrix, or
+///       kk = m-i if op(a) is an lower triangular matrix.
+/// Therefore
+/// B_ij = (X_ij + (kk-1) * gamma) / alpha, if diag == Unit
+/// B_ij = kk * gamma / alpha, otherwise.
+template <typename T>
+linear_system_t<T> sampleLeftTr(blas::Uplo uplo, blas::Op op, blas::Diag diag, T alpha, SizeType m) {
+  static_assert(std::is_arithmetic_v<T> && !std::is_integral_v<T>,
+                "it is valid just with floating point values");
+
+  bool op_a_lower = (uplo == blas::Uplo::Lower && op == blas::Op::NoTrans) ||
+                    (uplo == blas::Uplo::Upper && op != blas::Op::NoTrans);
+
+  auto el_op_a = [op_a_lower, diag](const GlobalElementIndex& index) -> T {
+    if ((op_a_lower && index.row() < index.col()) || (!op_a_lower && index.row() > index.col()) ||
+        (diag == blas::Diag::Unit && index.row() == index.col()))
+      return static_cast<T>(-9.9);
+
+    const T i = index.row();
+    const T k = index.col();
+
+    return (i + static_cast<T>(1)) / (k + static_cast<T>(.5));
+  };
+
+  auto el_x = [](const GlobalElementIndex& index) -> T {
+    const T k = index.row();
+    const T j = index.col();
+
+    return (k + static_cast<T>(.5)) / (j + static_cast<T>(2));
+  };
+
+  auto el_b = [m, alpha, diag, op_a_lower, el_x](const GlobalElementIndex& index) -> T {
+    const dlaf::BaseType<T> kk = op_a_lower ? index.row() + 1 : m - index.row();
+
+    const T i = index.row();
+    const T j = index.col();
+    const T gamma = (i + 1) / (j + 2);
+    if (diag == blas::Diag::Unit)
+      return ((kk - 1) * gamma + el_x(index)) / alpha;
+    else
+      return kk * gamma / alpha;
+  };
+
+  return {el_op_a, el_b, el_x};
+}
+
+}
diff --git a/miniapp/miniapp_triangular_solver.cpp b/miniapp/miniapp_triangular_solver.cpp
index c5427dd748..9d3478e9ed 100644
--- a/miniapp/miniapp_triangular_solver.cpp
+++ b/miniapp/miniapp_triangular_solver.cpp
@@ -115,6 +115,7 @@ struct triangularSolverMiniapp {
     };
 
     const auto side = opts.side;
+    DLAF_ASSERT(side == blas::Side::Left, side);
     const auto uplo = opts.uplo;
     const auto op = opts.op;
     const auto diag = opts.diag;
diff --git a/scripts/miniapps.py b/scripts/miniapps.py
index 8706eee9a1..cdc05a397f 100644
--- a/scripts/miniapps.py
+++ b/scripts/miniapps.py
@@ -278,6 +278,41 @@ def trsm(
     return cmd, env.strip()
 
 
+def trmm(
+    system,
+    lib,
+    build_dir,
+    nodes,
+    rpn,
+    m_sz,
+    n_sz,
+    mb_sz,
+    nruns,
+    suffix="na",
+    extra_flags="",
+    env="",
+    dtype="d",
+):
+    if n_sz == None:
+        n_sz = m_sz
+
+    _check_ranks_per_node(system, lib, rpn)
+    [total_ranks, cores_per_rank, threads_per_rank] = _computeResourcesNeededList(system, nodes, rpn)
+    gr, gc = _sq_factor(total_ranks)
+
+    if lib.startswith("dlaf"):
+        _check_type(dtype)
+        env += " OMP_NUM_THREADS=1"
+        app = f"{build_dir}/miniapp/miniapp_triangular_multiplication"
+        opts = f"--type {dtype} --m {m_sz} --n {n_sz} --mb {mb_sz} --nb {mb_sz} --grid-rows {gr} --grid-cols {gc} --nruns {nruns} {extra_flags}"
+    else:
+        raise ValueError(_err_msg(lib))
+
+    _checkAppExec(app)
+    cmd = f"{app} {opts}".strip() + f" >> trmm_{lib}_{suffix}.out 2>&1"
+    return cmd, env.strip()
+
+
 # lib: allowed libraries are dlaf|slate
 # rpn: ranks per node
 #
diff --git a/scripts/plot_trmm_strong.py b/scripts/plot_trmm_strong.py
new file mode 100755
index 0000000000..8df58b05a7
--- /dev/null
+++ b/scripts/plot_trmm_strong.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+#
+# Distributed Linear Algebra with Future (DLAF)
+#
+# Copyright (c) 2018-2023, ETH Zurich
+# All rights reserved.
+#
+# Please, refer to the LICENSE file in the root directory.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+
+import postprocess as pp
+
+df = pp.parse_jobs_cmdargs(description="Plot trmm strong scaling benchmarks.")
+
+df_grp = pp.calc_trmm_metrics(df)
+pp.gen_trmm_plots_strong(df_grp)
+pp.gen_trmm_plots_strong(df_grp, combine_mb=True)
diff --git a/scripts/plot_trmm_weak.py b/scripts/plot_trmm_weak.py
new file mode 100755
index 0000000000..0d44009e66
--- /dev/null
+++ b/scripts/plot_trmm_weak.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+#
+# Distributed Linear Algebra with Future (DLAF)
+#
+# Copyright (c) 2018-2023, ETH Zurich
+# All rights reserved.
+#
+# Please, refer to the LICENSE file in the root directory.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+
+import postprocess as pp
+
+df = pp.parse_jobs_cmdargs(description="Plot trmm weak scaling benchmarks.")
+
+df_grp = pp.calc_trmm_metrics(df)
+pp.gen_trmm_plots_weak(df_grp, 1024, logx=True)
+pp.gen_trmm_plots_weak(df_grp, 1024, logx=True, combine_mb=True)
diff --git a/scripts/postprocess.py b/scripts/postprocess.py
index 473faf55d7..4a5fda51f2 100644
--- a/scripts/postprocess.py
+++ b/scripts/postprocess.py
@@ -49,7 +49,7 @@ def _gen_nodes_plot(
     """
     Args:
         plt_type:       ppn | time
-        plt_routine:    chol | hegst | red2band | band2trid | trid_evp | bt_band2trid | bt_red2band | trsm | evp | gevp It is used to filter data.
+        plt_routine:    chol | hegst | red2band | band2trid | trid_evp | bt_band2trid | bt_red2band | trsm | trmm | evp | gevp It is used to filter data.
         title:          title of the plot
         df:             the pandas.DataFrame with the data for the plot
         combine_mb:     bool indicates if different mb has to be included in the same plot
@@ -240,7 +240,7 @@ def _parse_line_based(fout, bench_name, nodes):
         # fail.
         alg_name = bench_name[0 : bench_name.find("_dlaf")]
 
-        if alg_name in ["chol", "hegst", "trsm"]:
+        if alg_name in ["chol", "hegst", "trsm", "trmm"]:
             pstr_res = "[{run_index:d}] {time:g}s {perf:g}GFlop/s{matrix_type:optional_text} ({matrix_rows:d}, {matrix_cols:d}) ({block_rows:d}, {block_cols:d}) ({grid_rows:d}, {grid_cols:d}) {:d}{backend:optional_text}"
         if alg_name in ["red2band", "band2trid", "bt_band2trid", "bt_red2band"]:
             pstr_res = "[{run_index:d}] {time:g}s {perf:g}GFlop/s{matrix_type:optional_text} ({matrix_rows:d}, {matrix_cols:d}) ({block_rows:d}, {block_cols:d}) {band:d} ({grid_rows:d}, {grid_cols:d}) {:d}{backend:optional_text}"
@@ -435,6 +435,10 @@ def calc_trsm_metrics(df):
     return _calc_metrics(["matrix_rows", "matrix_cols", "block_rows", "nodes", "bench_name"], df)
 
 
+def calc_trmm_metrics(df):
+    return _calc_metrics(["matrix_rows", "matrix_cols", "block_rows", "nodes", "bench_name"], df)
+
+
 def calc_gen2std_metrics(df):
     return _calc_metrics(["matrix_rows", "block_rows", "nodes", "bench_name"], df)
 
@@ -500,7 +504,7 @@ def _gen_plot(
     Args:
         scaling         strong | weak
         name:           name of the routine to be included in the title
-        routine:        chol | hegst | red2band | band2trid | trid_evp | bt_band2trid | trsm | evp | gevp
+        routine:        chol | hegst | red2band | band2trid | trid_evp | bt_band2trid | trsm | trmm | evp | gevp
         combine_mb:     bool indicates if different mb has to be included in the same plot
         size_type:      m | mn It indicates which sizes are relevant.
         customize_ppn:  function accepting the two arguments fig and ax for ppn plot customization
@@ -779,6 +783,74 @@ def gen_trsm_plots_weak(
     )
 
 
+def gen_trmm_plots_strong(
+    df,
+    logx=False,
+    combine_mb=False,
+    filename_suffix=None,
+    customize_ppn=add_basic_legend,
+    customize_time=add_basic_legend,
+    **proxy_args,
+):
+    """
+    Args:
+        customize_ppn:  function accepting the two arguments fig and ax for ppn plot customization
+        customize_time: function accepting the two arguments fig and ax for time plot customization
+        Default customization (ppn and time): add_basic_legend. They can be set to "None" to remove the legend.
+    """
+    _gen_plot(
+        scaling="strong",
+        name="TRMM",
+        routine="trmm",
+        filename="trmm",
+        size_type="mn",
+        df=df,
+        logx=logx,
+        combine_mb=combine_mb,
+        filename_suffix=filename_suffix,
+        ppn_plot=True,
+        customize_ppn=customize_ppn,
+        time_plot=True,
+        customize_time=customize_time,
+        **proxy_args,
+    )
+
+
+def gen_trmm_plots_weak(
+    df,
+    weak_rt_approx,
+    logx=False,
+    combine_mb=False,
+    filename_suffix=None,
+    customize_ppn=add_basic_legend,
+    customize_time=add_basic_legend,
+    **proxy_args,
+):
+    """
+    Args:
+        customize_ppn:  function accepting the two arguments fig and ax for ppn plot customization
+        customize_time: function accepting the two arguments fig and ax for time plot customization
+        Default customization (ppn and time): add_basic_legend. They can be set to "None" to remove the legend.
+    """
+    _gen_plot(
+        scaling="weak",
+        name="TRMM",
+        routine="trmm",
+        filename="trmm",
+        size_type="mn",
+        df=df,
+        logx=logx,
+        combine_mb=combine_mb,
+        filename_suffix=filename_suffix,
+        ppn_plot=True,
+        customize_ppn=customize_ppn,
+        time_plot=True,
+        customize_time=customize_time,
+        weak_rt_approx=weak_rt_approx,
+        **proxy_args,
+    )
+
+
 def gen_gen2std_plots_strong(
     df,
     logx=None,

From e0b8b9ee83112edd3ccdf19260ce9bb1db2b668c Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Mon, 25 Sep 2023 10:58:15 +0200
Subject: [PATCH 3/7] merge-squashed trisolver dist change to reduce gemm cost

---
 .../dlaf/eigensolver/tridiag_solver/merge.h   | 143 +++++++++++++-----
 1 file changed, 105 insertions(+), 38 deletions(-)

diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h
index e61c74f7c5..079aef8423 100644
--- a/include/dlaf/eigensolver/tridiag_solver/merge.h
+++ b/include/dlaf/eigensolver/tridiag_solver/merge.h
@@ -500,6 +500,35 @@ auto stablePartitionIndexForDeflationArrays(const matrix::Distribution& dist_sub
     index_sorted_coltype[to_sizet(jjj_el)] = jj_el;
   }
 
+  // TODO manage edge cases
+  std::array<SizeType, 3> n_udl = [&]() {
+    SizeType first_dense;
+    for (first_dense = 0; first_dense < n; ++first_dense) {
+      const SizeType initial_el = index_sorted_coltype[to_sizet(first_dense)];
+      const ColType coltype = types[to_sizet(initial_el)];
+      if (ColType::UpperHalf != coltype)
+        break;
+    }
+
+    SizeType last_dense;
+    for (last_dense = n - 1; last_dense >= 0; --last_dense) {
+      const SizeType initial_el = index_sorted_coltype[to_sizet(last_dense)];
+      const ColType coltype = types[to_sizet(initial_el)];
+      if (ColType::LowerHalf != coltype && ColType::Deflated != coltype)
+        break;
+    }
+
+    SizeType last_lower;
+    for (last_lower = n - 1; last_lower >= 0; --last_lower) {
+      const SizeType initial_el = index_sorted_coltype[to_sizet(last_lower)];
+      const ColType coltype = types[to_sizet(initial_el)];
+      if (ColType::Deflated != coltype)
+        break;
+    }
+
+    return std::array<SizeType, 3>{first_dense, last_dense + 1, last_lower + 1};
+  }();
+
   // invert i3 and store it in i2 (temporary)
   //    i3 (in)  : initial  <--- deflated
   //    i2 (out) : deflated <--- initial
@@ -535,7 +564,7 @@ auto stablePartitionIndexForDeflationArrays(const matrix::Distribution& dist_sub
   for (SizeType i = 0; i < n; ++i)
     i2[i6[i]] = i;
 
-  return std::tuple(k, k_lc);
+  return std::tuple(k, k_lc, n_udl);
 }
 
 template <class T>
@@ -1308,33 +1337,14 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S
                  const SizeType* i2 = i2_tiles_arr[0].get().ptr();
                  const SizeType* i6 = i6_tiles_arr[0].get().ptr();
 
-                 // STEP 0a: Fill ones for deflated Eigenvectors and copy related Eigenvalues (single-thread)
-                 // Note: this step is completely independent from the rest, but it is small and it is going
-                 // to be dropped soon.
+                 // STEP 0a: Permute eigenvalues for deflated eigenvectors (single-thread)
                  // Note: use last threads that in principle should have less work to do
                  if (k < n && thread_idx == nthreads - 1) {
                    const T* eval_initial_ptr = d_tiles[0].get().ptr();
                    T* eval_ptr = eval_tiles[0].ptr();
 
                    for (SizeType j_el_lc = k_lc; j_el_lc < n_el_lc; ++j_el_lc) {
-                     const SizeType j_el =
-                         dist_sub.global_element_from_local_element<Coord::Col>(j_el_lc);
-                     const SizeType i_el = j_el;
-
-                     if (dist_sub.rank_index().row() == dist_sub.rank_global_element<Coord::Row>(i_el)) {
-                       const SizeType i_el_lc =
-                           dist_sub.local_element_from_global_element<Coord::Row>(i_el);
-                       const LocalTileIndex
-                           i_lc{dist_sub.local_tile_from_local_element<Coord::Row>(i_el_lc),
-                                dist_sub.local_tile_from_local_element<Coord::Col>(j_el_lc)};
-                       const SizeType linear_lc = dist_extra::local_tile_linear_index(dist_sub, i_lc);
-                       const TileElementIndex
-                           ij_el_tl{dist_sub.tile_element_from_local_element<Coord::Row>(i_el_lc),
-                                    dist_sub.tile_element_from_local_element<Coord::Col>(j_el_lc)};
-
-                       evec_tiles[to_sizet(linear_lc)](ij_el_tl) = T{1};
-                     }
-
+                     const SizeType j_el = dist_sub.globalElementFromLocalElement<Coord::Col>(j_el_lc);
                      eval_ptr[j_el] = eval_initial_ptr[i6[j_el]];
                    }
                  }
@@ -1650,19 +1660,31 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
                           WorkSpace<T, D>& ws, WorkSpaceHost<T>& ws_h,
                           DistWorkSpaceHostMirror<T, D>& ws_hm) {
   namespace ex = pika::execution::experimental;
+  using matrix::internal::distribution::global_tile_element_distance;
   using pika::execution::thread_priority;
 
-  const matrix::Distribution& dist_evecs = ws.e0.distribution();
+  const matrix::Distribution& dist = ws.e0.distribution();
+
+  const GlobalElementIndex sub_offset{i_begin * dist.blockSize().rows(),
+                                      i_begin * dist.blockSize().cols()};
+  const matrix::Distribution dist_sub(
+      dist, {sub_offset,
+             {
+                 global_tile_element_distance<Coord::Row>(dist, i_begin, i_end),
+                 global_tile_element_distance<Coord::Col>(dist, i_begin, i_end),
+             }});
 
   // Calculate the size of the upper subproblem
-  const SizeType n1 = dist_evecs.globalTileElementDistance<Coord::Row>(i_begin, i_split);
+  const SizeType n = dist.globalTileElementDistance<Coord::Row>(i_begin, i_end);
+  const SizeType n_upper = dist.globalTileElementDistance<Coord::Row>(i_begin, i_split);
+  const SizeType n_lower = dist.globalTileElementDistance<Coord::Row>(i_split, i_end);
 
   // The local size of the subproblem
   const GlobalTileIndex idx_gl_begin(i_begin, i_begin);
-  const LocalTileIndex idx_loc_begin{dist_evecs.nextLocalTileFromGlobalTile<Coord::Row>(i_begin),
-                                     dist_evecs.nextLocalTileFromGlobalTile<Coord::Col>(i_begin)};
-  const LocalTileIndex idx_loc_end{dist_evecs.nextLocalTileFromGlobalTile<Coord::Row>(i_end),
-                                   dist_evecs.nextLocalTileFromGlobalTile<Coord::Col>(i_end)};
+  const LocalTileIndex idx_loc_begin{dist.nextLocalTileFromGlobalTile<Coord::Row>(i_begin),
+                                     dist.nextLocalTileFromGlobalTile<Coord::Col>(i_begin)};
+  const LocalTileIndex idx_loc_end{dist.nextLocalTileFromGlobalTile<Coord::Row>(i_end),
+                                   dist.nextLocalTileFromGlobalTile<Coord::Col>(i_end)};
   const LocalTileSize sz_loc_tiles = idx_loc_end - idx_loc_begin;
   const LocalTileIndex idx_begin_tiles_vec(i_begin, 0);
   const LocalTileSize sz_tiles_vec(i_end - i_begin, 1);
@@ -1689,7 +1711,7 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
   }
 
   // Update indices of second sub-problem
-  addIndex(i_split, i_end, n1, ws_h.i1);
+  addIndex(i_split, i_end, n_upper, ws_h.i1);
 
   // Step #1
   //
@@ -1699,7 +1721,7 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
   // - deflate `d`, `z` and `c`
   // - apply Givens rotations to `Q` - `evecs`
   //
-  sortIndex(i_begin, i_end, ex::just(n1), ws_h.d0, ws_h.i1, ws_hm.i2);
+  sortIndex(i_begin, i_end, ex::just(n_upper), ws_h.d0, ws_h.i1, ws_hm.i2);
 
   auto rots =
       applyDeflation(i_begin, i_end, scaled_rho, std::move(tol), ws_hm.i2, ws_h.d0, ws_hm.z0, ws_h.c);
@@ -1725,11 +1747,12 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
   // - reorder `d0 -> d1`, `z0 -> z1`, using `i3` such that deflated entries are at the bottom.
   // - solve the rank-1 problem and save eigenvalues in `d0` and `d1` (copy) and eigenvectors in `e2`.
   // - set deflated diagonal entries of `U` to 1 (temporary solution until optimized GEMM is implemented)
-  auto [k_unique, k_lc] =
-      ex::split_tuple(stablePartitionIndexForDeflation(dist_evecs, i_begin, i_end, ws_h.c, ws_h.d0,
-                                                       ws_hm.i2, ws_h.i3, ws_hm.i5, ws_h.i4, ws_hm.i6));
+  auto [k_unique, k_lc_unique, n_udl] =
+      ex::split_tuple(stablePartitionIndexForDeflation(dist, i_begin, i_end, ws_h.c, ws_h.d0, ws_hm.i2,
+                                                       ws_h.i3, ws_hm.i5, ws_h.i4, ws_hm.i6));
 
   auto k = ex::split(std::move(k_unique));
+  auto k_lc = ex::split(std::move(k_lc_unique));
 
   // Reorder Eigenvectors
   using dlaf::permutations::internal::permuteJustLocal;
@@ -1749,18 +1772,62 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
 
   // Note: here ws_hm.z0 is used as a contiguous buffer for the laed4 call
   matrix::util::set0<Backend::MC>(thread_priority::normal, idx_loc_begin, sz_loc_tiles, ws_hm.e2);
-  solveRank1ProblemDist(row_task_chain(), col_task_chain(), i_begin, i_end, k, std::move(k_lc),
+  solveRank1ProblemDist(row_task_chain(), col_task_chain(), i_begin, i_end, k, k_lc,
                         std::move(scaled_rho), ws_hm.d1, ws_hm.z1, ws_h.d0, ws_h.i4, ws_hm.i6, ws_hm.i2,
                         ws_hm.e2);
+  copy(idx_loc_begin, sz_loc_tiles, ws_hm.e2, ws.e2);
 
   // Step #3: Eigenvectors of the tridiagonal system: Q * U
   //
   // The eigenvectors resulting from the multiplication are already in the order of the eigenvalues as
   // prepared for the deflated system.
-  copy(idx_loc_begin, sz_loc_tiles, ws_hm.e2, ws.e2);
-  dlaf::multiplication::internal::generalSubMatrix<B, D, T>(grid, row_task_chain, col_task_chain,
-                                                            i_begin, i_end, T(1), ws.e1, ws.e2, T(0),
-                                                            ws.e0);
+  ex::start_detached(
+      ex::when_all(std::move(k_lc), std::move(n_udl), row_task_chain(), col_task_chain()) |
+      ex::transfer(dlaf::internal::getBackendScheduler<Backend::MC>()) |
+      ex::then([dist_sub, sub_offset, n, n_upper, n_lower, e0 = ws.e0.subPipeline(),
+                e1 = ws.e1.subPipelineConst(), e2 = ws.e2.subPipelineConst()](
+                   const SizeType k_lc, const std::array<SizeType, 3>& n_udl, auto&& row_comm_wrapper,
+                   auto&& col_comm_wrapper) mutable {
+        using dlaf::matrix::internal::MatrixRef;
+
+        common::Pipeline<comm::Communicator> sub_comm_row(row_comm_wrapper.get());
+        common::Pipeline<comm::Communicator> sub_comm_col(col_comm_wrapper.get());
+
+        const auto [a, b, c] = n_udl;
+
+        using GEMM = dlaf::multiplication::internal::General<B, D, T>;
+        {
+          MatrixRef<const T, D> e1_sub(e1, {sub_offset, {n_upper, b}});
+          MatrixRef<const T, D> e2_sub(e2, {sub_offset, {b, c}});
+          MatrixRef<T, D> e0_sub(e0, {sub_offset, {n_upper, c}});
+
+          GEMM::callNN(sub_comm_row, sub_comm_col, T(1), e1_sub, e2_sub, T(0), e0_sub);
+        }
+
+        {
+          MatrixRef<const T, D> e1_sub(e1, {{sub_offset.row() + n_upper, sub_offset.col() + a},
+                                            {n_lower, c - a}});
+          MatrixRef<const T, D> e2_sub(e2, {{sub_offset.row() + a, sub_offset.col()}, {c - a, c}});
+          MatrixRef<T, D> e0_sub(e0, {{sub_offset.row() + n_upper, sub_offset.col()}, {n_lower, c}});
+
+          GEMM::callNN(sub_comm_row, sub_comm_col, T(1), e1_sub, e2_sub, T(0), e0_sub);
+        }
+
+        // copy deflated from e1 to e0
+        if (k_lc < dist_sub.localSize().cols()) {
+          const SizeType k = dist_sub.globalElementFromLocalElement<Coord::Col>(k_lc);
+          const matrix::internal::SubMatrixSpec deflated_submat{{sub_offset.row(), sub_offset.col() + k},
+                                                                {n, n - k}};
+          MatrixRef<T, D> sub_e0(e0, deflated_submat);
+          MatrixRef<const T, D> sub_e1(e1, deflated_submat);
+
+          copy(sub_e1, sub_e0);
+        }
+
+        namespace tt = pika::this_thread::experimental;
+        tt::sync_wait(sub_comm_row());
+        tt::sync_wait(sub_comm_col());
+      }));
 
   // Step #4: Final permutation to sort eigenvalues and eigenvectors
   //

From 4f3150742468f23a9b9e50bb210a323fa312100d Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Tue, 12 Dec 2023 12:13:59 +0100
Subject: [PATCH 4/7] start factoring out gemm

---
 .../dlaf/eigensolver/tridiag_solver/merge.h   | 107 ++++++++++--------
 1 file changed, 60 insertions(+), 47 deletions(-)

diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h
index 079aef8423..20601ac871 100644
--- a/include/dlaf/eigensolver/tridiag_solver/merge.h
+++ b/include/dlaf/eigensolver/tridiag_solver/merge.h
@@ -1650,6 +1650,64 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S
       }));
 }
 
+template <Backend B, class T, Device D, class KLcSender, class UDLSenders>
+void multiplyEigenvectors(const matrix::Distribution& dist_sub,
+                          common::Pipeline<comm::Communicator>& row_task_chain,
+                          common::Pipeline<comm::Communicator>& col_task_chain,
+                          const GlobalElementIndex sub_offset, const SizeType n, const SizeType n_upper,
+                          const SizeType n_lower, Matrix<T, D>& e0, Matrix<T, D>& e1, Matrix<T, D>& e2,
+                          KLcSender&& k_lc, UDLSenders&& n_udl) {
+  namespace ex = pika::execution::experimental;
+
+  ex::start_detached(
+      ex::when_all(std::forward<KLcSender>(k_lc), std::forward<UDLSenders>(n_udl), row_task_chain(),
+                   col_task_chain()) |
+      ex::transfer(dlaf::internal::getBackendScheduler<Backend::MC>()) |
+      ex::then([dist_sub, sub_offset, n, n_upper, n_lower, e0 = e0.subPipeline(),
+                e1 = e1.subPipelineConst(),
+                e2 = e2.subPipelineConst()](const SizeType k_lc, const std::array<SizeType, 3>& n_udl,
+                                            auto&& row_comm_wrapper, auto&& col_comm_wrapper) mutable {
+        using dlaf::matrix::internal::MatrixRef;
+
+        common::Pipeline<comm::Communicator> sub_comm_row(row_comm_wrapper.get());
+        common::Pipeline<comm::Communicator> sub_comm_col(col_comm_wrapper.get());
+
+        const auto [a, b, c] = n_udl;
+
+        using GEMM = dlaf::multiplication::internal::General<B, D, T>;
+        {
+          MatrixRef<const T, D> e1_sub(e1, {sub_offset, {n_upper, b}});
+          MatrixRef<const T, D> e2_sub(e2, {sub_offset, {b, c}});
+          MatrixRef<T, D> e0_sub(e0, {sub_offset, {n_upper, c}});
+
+          GEMM::callNN(sub_comm_row, sub_comm_col, T(1), e1_sub, e2_sub, T(0), e0_sub);
+        }
+
+        {
+          MatrixRef<const T, D> e1_sub(e1, {{sub_offset.row() + n_upper, sub_offset.col() + a},
+                                            {n_lower, c - a}});
+          MatrixRef<const T, D> e2_sub(e2, {{sub_offset.row() + a, sub_offset.col()}, {c - a, c}});
+          MatrixRef<T, D> e0_sub(e0, {{sub_offset.row() + n_upper, sub_offset.col()}, {n_lower, c}});
+
+          GEMM::callNN(sub_comm_row, sub_comm_col, T(1), e1_sub, e2_sub, T(0), e0_sub);
+        }
+
+        if (k_lc < dist_sub.localSize().cols()) {
+          const SizeType k = dist_sub.globalElementFromLocalElement<Coord::Col>(k_lc);
+          const matrix::internal::SubMatrixSpec deflated_submat{{sub_offset.row(), sub_offset.col() + k},
+                                                                {n, n - k}};
+          MatrixRef<T, D> sub_e0(e0, deflated_submat);
+          MatrixRef<const T, D> sub_e1(e1, deflated_submat);
+
+          copy(sub_e1, sub_e0);
+        }
+
+        namespace tt = pika::this_thread::experimental;
+        tt::sync_wait(sub_comm_row());
+        tt::sync_wait(sub_comm_col());
+      }));
+}
+
 // Distributed version of the tridiagonal solver on CPUs
 template <Backend B, class T, Device D, class RhoSender>
 void mergeDistSubproblems(comm::CommunicatorGrid grid,
@@ -1781,53 +1839,8 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
   //
   // The eigenvectors resulting from the multiplication are already in the order of the eigenvalues as
   // prepared for the deflated system.
-  ex::start_detached(
-      ex::when_all(std::move(k_lc), std::move(n_udl), row_task_chain(), col_task_chain()) |
-      ex::transfer(dlaf::internal::getBackendScheduler<Backend::MC>()) |
-      ex::then([dist_sub, sub_offset, n, n_upper, n_lower, e0 = ws.e0.subPipeline(),
-                e1 = ws.e1.subPipelineConst(), e2 = ws.e2.subPipelineConst()](
-                   const SizeType k_lc, const std::array<SizeType, 3>& n_udl, auto&& row_comm_wrapper,
-                   auto&& col_comm_wrapper) mutable {
-        using dlaf::matrix::internal::MatrixRef;
-
-        common::Pipeline<comm::Communicator> sub_comm_row(row_comm_wrapper.get());
-        common::Pipeline<comm::Communicator> sub_comm_col(col_comm_wrapper.get());
-
-        const auto [a, b, c] = n_udl;
-
-        using GEMM = dlaf::multiplication::internal::General<B, D, T>;
-        {
-          MatrixRef<const T, D> e1_sub(e1, {sub_offset, {n_upper, b}});
-          MatrixRef<const T, D> e2_sub(e2, {sub_offset, {b, c}});
-          MatrixRef<T, D> e0_sub(e0, {sub_offset, {n_upper, c}});
-
-          GEMM::callNN(sub_comm_row, sub_comm_col, T(1), e1_sub, e2_sub, T(0), e0_sub);
-        }
-
-        {
-          MatrixRef<const T, D> e1_sub(e1, {{sub_offset.row() + n_upper, sub_offset.col() + a},
-                                            {n_lower, c - a}});
-          MatrixRef<const T, D> e2_sub(e2, {{sub_offset.row() + a, sub_offset.col()}, {c - a, c}});
-          MatrixRef<T, D> e0_sub(e0, {{sub_offset.row() + n_upper, sub_offset.col()}, {n_lower, c}});
-
-          GEMM::callNN(sub_comm_row, sub_comm_col, T(1), e1_sub, e2_sub, T(0), e0_sub);
-        }
-
-        // copy deflated from e1 to e0
-        if (k_lc < dist_sub.localSize().cols()) {
-          const SizeType k = dist_sub.globalElementFromLocalElement<Coord::Col>(k_lc);
-          const matrix::internal::SubMatrixSpec deflated_submat{{sub_offset.row(), sub_offset.col() + k},
-                                                                {n, n - k}};
-          MatrixRef<T, D> sub_e0(e0, deflated_submat);
-          MatrixRef<const T, D> sub_e1(e1, deflated_submat);
-
-          copy(sub_e1, sub_e0);
-        }
-
-        namespace tt = pika::this_thread::experimental;
-        tt::sync_wait(sub_comm_row());
-        tt::sync_wait(sub_comm_col());
-      }));
+  multiplyEigenvectors<B>(dist_sub, row_task_chain, col_task_chain, sub_offset, n, n_upper, n_lower,
+                          ws.e0, ws.e1, ws.e2, std::move(k_lc), std::move(n_udl));
 
   // Step #4: Final permutation to sort eigenvalues and eigenvectors
   //

From a824d404bcc43a67f9cba99ce3b8ec45195c88e1 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Tue, 12 Dec 2023 16:33:18 +0100
Subject: [PATCH 5/7] doc + minor changes

---
 .../dlaf/eigensolver/tridiag_solver/merge.h   | 88 +++++++++++++++++--
 1 file changed, 79 insertions(+), 9 deletions(-)

diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h
index 20601ac871..d7a3566f6c 100644
--- a/include/dlaf/eigensolver/tridiag_solver/merge.h
+++ b/include/dlaf/eigensolver/tridiag_solver/merge.h
@@ -412,6 +412,7 @@ auto stablePartitionIndexForDeflationArrays(const SizeType n, const ColType* typ
 //
 // @return k                    number of non-deflated eigenvectors
 // @return k_local              number of local non-deflated eigenvectors
+// @return n_udl                tuple with global indices for [first_dense, last_dense, last_lower]
 template <class T>
 auto stablePartitionIndexForDeflationArrays(const matrix::Distribution& dist_sub, const ColType* types,
                                             const T* evals, SizeType* perm_sorted,
@@ -500,7 +501,6 @@ auto stablePartitionIndexForDeflationArrays(const matrix::Distribution& dist_sub
     index_sorted_coltype[to_sizet(jjj_el)] = jj_el;
   }
 
-  // TODO manage edge cases
   std::array<SizeType, 3> n_udl = [&]() {
     SizeType first_dense;
     for (first_dense = 0; first_dense < n; ++first_dense) {
@@ -510,14 +510,11 @@ auto stablePartitionIndexForDeflationArrays(const matrix::Distribution& dist_sub
         break;
     }
 
-    SizeType last_dense;
-    for (last_dense = n - 1; last_dense >= 0; --last_dense) {
-      const SizeType initial_el = index_sorted_coltype[to_sizet(last_dense)];
-      const ColType coltype = types[to_sizet(initial_el)];
-      if (ColType::LowerHalf != coltype && ColType::Deflated != coltype)
-        break;
-    }
-
+    // Note:
+    // Eigenvectors will be sorted according index_sorted_coltype, i.e. local sort by coltype.
+    // Since it is a local order, it is legit if deflated are globally interlaced with other column
+    // types. However, GEMM will be able to skip just the last global contiguous group of deflated
+    // eigenvectors, but not the ones interlaced with others.
     SizeType last_lower;
     for (last_lower = n - 1; last_lower >= 0; --last_lower) {
       const SizeType initial_el = index_sorted_coltype[to_sizet(last_lower)];
@@ -526,6 +523,14 @@ auto stablePartitionIndexForDeflationArrays(const matrix::Distribution& dist_sub
         break;
     }
 
+    SizeType last_dense;
+    for (last_dense = last_lower; last_dense >= 0; --last_dense) {
+      const SizeType initial_el = index_sorted_coltype[to_sizet(last_dense)];
+      const ColType coltype = types[to_sizet(initial_el)];
+      if (ColType::LowerHalf != coltype && ColType::Deflated != coltype)
+        break;
+    }
+
     return std::array<SizeType, 3>{first_dense, last_dense + 1, last_lower + 1};
   }();
 
@@ -1657,6 +1662,71 @@ void multiplyEigenvectors(const matrix::Distribution& dist_sub,
                           const GlobalElementIndex sub_offset, const SizeType n, const SizeType n_upper,
                           const SizeType n_lower, Matrix<T, D>& e0, Matrix<T, D>& e1, Matrix<T, D>& e2,
                           KLcSender&& k_lc, UDLSenders&& n_udl) {
+  // Note:
+  // This function computes E0 = E1 . E2
+  //
+  // where E1 is the matrix with eigenvectors and it looks like this
+  //
+  //               ┌──────────┐ k
+  //               │    b     │ │
+  //                            ▼
+  //          ┌──  ┌───┬──────┬─┬────┐
+  //          │    │UUU│DDDDDD│ │XXXX│
+  //          │    │UUU│DDDDDD│ │XXXX│
+  //  n_upper │    │UUU│DDDDDD│ │XXXX│
+  //          │    │UUU│DDDDDD│ │XXXX│
+  //          │    │UUU│DDDDDD│ │XXXX│
+  //          ├──  ├───┼──────┼─┤XXXX│
+  //          │    │   │DDDDDD│L│XXXX│
+  //  n_lower │    │   │DDDDDD│L│XXXX│
+  //          │    │   │DDDDDD│L│XXXX│
+  //          └──  └───┴──────┴─┴────┘
+  //               │ a │
+  //               └───┘
+  //               │      c     │
+  //               └────────────┘
+  //
+  // Where (a, b, c) are the values from n_udl
+  //
+  // Note:
+  // E1 matrix does not have all deflated values at the end, indeed part of them are "interlaced" with
+  // others. The GEMM will perform anyway a computation for deflated eigenvectors (which are zeroed out)
+  // while the copy step will be performed at "local" level, so even interlaced ones will get copied
+  // in the right spot.
+  //
+  // The multiplication in two different steps in order to skip zero blocks of the matrix, created by
+  // the grouping of eigenvectors of different lengths (UPPER, DENSE and LOWER).
+  //
+  // 1. GEMM1 = TL . TOP
+  // 2. GEMM2 = BR . BOTTOM
+  // 3. copy DEFLATED
+  //
+  //                      ┌────────────┬────┐
+  //                      │            │    │
+  //                      │            │    │
+  //                      │   T O P    │    │
+  //                      │            │    │
+  //                      │            │    │
+  //                      ├────────────┤    │
+  //                      │            │    │
+  //                      │            │    │
+  //                      │B O T T O M │    │
+  //                      │            │    │
+  //                      └────────────┴────┘
+  //
+  // ┌──────────┬─┬────┐  ┌────────────┬────┐
+  // │          │0│    │  │            │    │
+  // │          │0│ D  │  │            │    │
+  // │   TL     │0│ E  │  │  GEMM 1    │ C  │
+  // │          │0│ F  │  │            │    │
+  // │          │0│ L  │  │            │ O  │
+  // ├───┬──────┴─┤ A  │  ├────────────┤    │
+  // │000│        │ T  │  │            │ P  │
+  // │000│        │ E  │  │            │    │
+  // │000│  BR    │ D  │  │  GEMM 2    │ Y  │
+  // │000│        │    │  │            │    │
+  // └───┴────────┴────┘  └────────────┴────┘
+
   namespace ex = pika::execution::experimental;
 
   ex::start_detached(

From c42260279613a70611dc1fc44c785b5f2908fad7 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Tue, 12 Dec 2023 16:54:05 +0100
Subject: [PATCH 6/7] make gemm scheduler hp + fix doc + minor changes

---
 .../dlaf/eigensolver/tridiag_solver/merge.h   | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h
index d7a3566f6c..be8cdc81b3 100644
--- a/include/dlaf/eigensolver/tridiag_solver/merge.h
+++ b/include/dlaf/eigensolver/tridiag_solver/merge.h
@@ -1656,10 +1656,9 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S
 }
 
 template <Backend B, class T, Device D, class KLcSender, class UDLSenders>
-void multiplyEigenvectors(const matrix::Distribution& dist_sub,
+void multiplyEigenvectors(const GlobalElementIndex sub_offset, const matrix::Distribution& dist_sub,
                           common::Pipeline<comm::Communicator>& row_task_chain,
-                          common::Pipeline<comm::Communicator>& col_task_chain,
-                          const GlobalElementIndex sub_offset, const SizeType n, const SizeType n_upper,
+                          common::Pipeline<comm::Communicator>& col_task_chain, const SizeType n_upper,
                           const SizeType n_lower, Matrix<T, D>& e0, Matrix<T, D>& e1, Matrix<T, D>& e2,
                           KLcSender&& k_lc, UDLSenders&& n_udl) {
   // Note:
@@ -1728,12 +1727,13 @@ void multiplyEigenvectors(const matrix::Distribution& dist_sub,
   // └───┴────────┴────┘  └────────────┴────┘
 
   namespace ex = pika::execution::experimental;
+  using pika::execution::thread_priority;
 
   ex::start_detached(
       ex::when_all(std::forward<KLcSender>(k_lc), std::forward<UDLSenders>(n_udl), row_task_chain(),
                    col_task_chain()) |
-      ex::transfer(dlaf::internal::getBackendScheduler<Backend::MC>()) |
-      ex::then([dist_sub, sub_offset, n, n_upper, n_lower, e0 = e0.subPipeline(),
+      ex::transfer(dlaf::internal::getBackendScheduler<Backend::MC>(thread_priority::high)) |
+      ex::then([dist_sub, sub_offset, n_upper, n_lower, e0 = e0.subPipeline(),
                 e1 = e1.subPipelineConst(),
                 e2 = e2.subPipelineConst()](const SizeType k_lc, const std::array<SizeType, 3>& n_udl,
                                             auto&& row_comm_wrapper, auto&& col_comm_wrapper) mutable {
@@ -1742,6 +1742,7 @@ void multiplyEigenvectors(const matrix::Distribution& dist_sub,
         common::Pipeline<comm::Communicator> sub_comm_row(row_comm_wrapper.get());
         common::Pipeline<comm::Communicator> sub_comm_col(col_comm_wrapper.get());
 
+        const SizeType n = dist_sub.size().cols();
         const auto [a, b, c] = n_udl;
 
         using GEMM = dlaf::multiplication::internal::General<B, D, T>;
@@ -1793,8 +1794,8 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
 
   const matrix::Distribution& dist = ws.e0.distribution();
 
-  const GlobalElementIndex sub_offset{i_begin * dist.blockSize().rows(),
-                                      i_begin * dist.blockSize().cols()};
+  const GlobalElementIndex sub_offset{i_begin * dist.tile_size().rows(),
+                                      i_begin * dist.tile_size().cols()};
   const matrix::Distribution dist_sub(
       dist, {sub_offset,
              {
@@ -1898,7 +1899,9 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
   applyIndex(i_begin, i_end, ws_h.i3, ws_h.d0, ws_hm.d1);
   applyIndex(i_begin, i_end, ws_h.i3, ws_hm.z0, ws_hm.z1);
 
-  // Note: here ws_hm.z0 is used as a contiguous buffer for the laed4 call
+  // Note:
+  // set0 is required because deflated eigenvectors rows won't be touched in rank1 and so they will be
+  // neutral when used in GEMM (copy will take care of them later)
   matrix::util::set0<Backend::MC>(thread_priority::normal, idx_loc_begin, sz_loc_tiles, ws_hm.e2);
   solveRank1ProblemDist(row_task_chain(), col_task_chain(), i_begin, i_end, k, k_lc,
                         std::move(scaled_rho), ws_hm.d1, ws_hm.z1, ws_h.d0, ws_h.i4, ws_hm.i6, ws_hm.i2,
@@ -1909,8 +1912,8 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
   //
   // The eigenvectors resulting from the multiplication are already in the order of the eigenvalues as
   // prepared for the deflated system.
-  multiplyEigenvectors<B>(dist_sub, row_task_chain, col_task_chain, sub_offset, n, n_upper, n_lower,
-                          ws.e0, ws.e1, ws.e2, std::move(k_lc), std::move(n_udl));
+  multiplyEigenvectors<B>(sub_offset, dist_sub, row_task_chain, col_task_chain, n_upper, n_lower, ws.e0,
+                          ws.e1, ws.e2, std::move(k_lc), std::move(n_udl));
 
   // Step #4: Final permutation to sort eigenvalues and eigenvectors
   //

From 4eb08c66efcde84fe5db701109a683c9c6e8b172 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Tue, 12 Dec 2023 17:03:28 +0100
Subject: [PATCH 7/7] snake case

---
 include/dlaf/eigensolver/tridiag_solver/merge.h | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h
index be8cdc81b3..0a535c147a 100644
--- a/include/dlaf/eigensolver/tridiag_solver/merge.h
+++ b/include/dlaf/eigensolver/tridiag_solver/merge.h
@@ -1763,8 +1763,8 @@ void multiplyEigenvectors(const GlobalElementIndex sub_offset, const matrix::Dis
           GEMM::callNN(sub_comm_row, sub_comm_col, T(1), e1_sub, e2_sub, T(0), e0_sub);
         }
 
-        if (k_lc < dist_sub.localSize().cols()) {
-          const SizeType k = dist_sub.globalElementFromLocalElement<Coord::Col>(k_lc);
+        if (k_lc < dist_sub.local_size().cols()) {
+          const SizeType k = dist_sub.global_element_from_local_element<Coord::Col>(k_lc);
           const matrix::internal::SubMatrixSpec deflated_submat{{sub_offset.row(), sub_offset.col() + k},
                                                                 {n, n - k}};
           MatrixRef<T, D> sub_e0(e0, deflated_submat);
@@ -1804,16 +1804,15 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
              }});
 
   // Calculate the size of the upper subproblem
-  const SizeType n = dist.globalTileElementDistance<Coord::Row>(i_begin, i_end);
-  const SizeType n_upper = dist.globalTileElementDistance<Coord::Row>(i_begin, i_split);
-  const SizeType n_lower = dist.globalTileElementDistance<Coord::Row>(i_split, i_end);
+  const SizeType n_upper = global_tile_element_distance<Coord::Row>(dist, i_begin, i_split);
+  const SizeType n_lower = global_tile_element_distance<Coord::Row>(dist, i_split, i_end);
 
   // The local size of the subproblem
   const GlobalTileIndex idx_gl_begin(i_begin, i_begin);
-  const LocalTileIndex idx_loc_begin{dist.nextLocalTileFromGlobalTile<Coord::Row>(i_begin),
-                                     dist.nextLocalTileFromGlobalTile<Coord::Col>(i_begin)};
-  const LocalTileIndex idx_loc_end{dist.nextLocalTileFromGlobalTile<Coord::Row>(i_end),
-                                   dist.nextLocalTileFromGlobalTile<Coord::Col>(i_end)};
+  const LocalTileIndex idx_loc_begin{dist.next_local_tile_from_global_tile<Coord::Row>(i_begin),
+                                     dist.next_local_tile_from_global_tile<Coord::Col>(i_begin)};
+  const LocalTileIndex idx_loc_end{dist.next_local_tile_from_global_tile<Coord::Row>(i_end),
+                                   dist.next_local_tile_from_global_tile<Coord::Col>(i_end)};
   const LocalTileSize sz_loc_tiles = idx_loc_end - idx_loc_begin;
   const LocalTileIndex idx_begin_tiles_vec(i_begin, 0);
   const LocalTileSize sz_tiles_vec(i_end - i_begin, 1);